diff options
| author | Dan Williams <dan.j.williams@intel.com> | 2009-09-08 20:55:21 -0400 |
|---|---|---|
| committer | Dan Williams <dan.j.williams@intel.com> | 2009-09-08 20:55:21 -0400 |
| commit | bbb20089a3275a19e475dbc21320c3742e3ca423 (patch) | |
| tree | 216fdc1cbef450ca688135c5b8969169482d9a48 /drivers/md | |
| parent | 3e48e656903e9fd8bc805c6a2c4264d7808d315b (diff) | |
| parent | 657a77fa7284d8ae28dfa48f1dc5d919bf5b2843 (diff) | |
Merge branch 'dmaengine' into async-tx-next
Conflicts:
crypto/async_tx/async_xor.c
drivers/dma/ioat/dma_v2.h
drivers/dma/ioat/pci.c
drivers/md/raid5.c
Diffstat (limited to 'drivers/md')
43 files changed, 3987 insertions, 1098 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 09c0c6e49ab5..2158377a1359 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
| @@ -257,6 +257,17 @@ config DM_MIRROR | |||
| 257 | Allow volume managers to mirror logical volumes, also | 257 | Allow volume managers to mirror logical volumes, also |
| 258 | needed for live data migration tools such as 'pvmove'. | 258 | needed for live data migration tools such as 'pvmove'. |
| 259 | 259 | ||
| 260 | config DM_LOG_USERSPACE | ||
| 261 | tristate "Mirror userspace logging (EXPERIMENTAL)" | ||
| 262 | depends on DM_MIRROR && EXPERIMENTAL && NET | ||
| 263 | select CONNECTOR | ||
| 264 | ---help--- | ||
| 265 | The userspace logging module provides a mechanism for | ||
| 266 | relaying the dm-dirty-log API to userspace. Log designs | ||
| 267 | which are more suited to userspace implementation (e.g. | ||
| 268 | shared storage logs) or experimental logs can be implemented | ||
| 269 | by leveraging this framework. | ||
| 270 | |||
| 260 | config DM_ZERO | 271 | config DM_ZERO |
| 261 | tristate "Zero target" | 272 | tristate "Zero target" |
| 262 | depends on BLK_DEV_DM | 273 | depends on BLK_DEV_DM |
| @@ -275,6 +286,25 @@ config DM_MULTIPATH | |||
| 275 | ---help--- | 286 | ---help--- |
| 276 | Allow volume managers to support multipath hardware. | 287 | Allow volume managers to support multipath hardware. |
| 277 | 288 | ||
| 289 | config DM_MULTIPATH_QL | ||
| 290 | tristate "I/O Path Selector based on the number of in-flight I/Os" | ||
| 291 | depends on DM_MULTIPATH | ||
| 292 | ---help--- | ||
| 293 | This path selector is a dynamic load balancer which selects | ||
| 294 | the path with the least number of in-flight I/Os. | ||
| 295 | |||
| 296 | If unsure, say N. | ||
| 297 | |||
| 298 | config DM_MULTIPATH_ST | ||
| 299 | tristate "I/O Path Selector based on the service time" | ||
| 300 | depends on DM_MULTIPATH | ||
| 301 | ---help--- | ||
| 302 | This path selector is a dynamic load balancer which selects | ||
| 303 | the path expected to complete the incoming I/O in the shortest | ||
| 304 | time. | ||
| 305 | |||
| 306 | If unsure, say N. | ||
| 307 | |||
| 278 | config DM_DELAY | 308 | config DM_DELAY |
| 279 | tristate "I/O delaying target (EXPERIMENTAL)" | 309 | tristate "I/O delaying target (EXPERIMENTAL)" |
| 280 | depends on BLK_DEV_DM && EXPERIMENTAL | 310 | depends on BLK_DEV_DM && EXPERIMENTAL |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 45cc5951d928..1dc4185bd781 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile | |||
| @@ -8,6 +8,8 @@ dm-multipath-y += dm-path-selector.o dm-mpath.o | |||
| 8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ | 8 | dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ |
| 9 | dm-snap-persistent.o | 9 | dm-snap-persistent.o |
| 10 | dm-mirror-y += dm-raid1.o | 10 | dm-mirror-y += dm-raid1.o |
| 11 | dm-log-userspace-y \ | ||
| 12 | += dm-log-userspace-base.o dm-log-userspace-transfer.o | ||
| 11 | md-mod-y += md.o bitmap.o | 13 | md-mod-y += md.o bitmap.o |
| 12 | raid456-y += raid5.o | 14 | raid456-y += raid5.o |
| 13 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ | 15 | raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \ |
| @@ -36,8 +38,11 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o | |||
| 36 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o | 38 | obj-$(CONFIG_DM_CRYPT) += dm-crypt.o |
| 37 | obj-$(CONFIG_DM_DELAY) += dm-delay.o | 39 | obj-$(CONFIG_DM_DELAY) += dm-delay.o |
| 38 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o | 40 | obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o |
| 41 | obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o | ||
| 42 | obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o | ||
| 39 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o | 43 | obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o |
| 40 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o | 44 | obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o |
| 45 | obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o | ||
| 41 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 46 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
| 42 | 47 | ||
| 43 | quiet_cmd_unroll = UNROLL $@ | 48 | quiet_cmd_unroll = UNROLL $@ |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 56df1cee8fb3..3319c2fec28e 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
| @@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, | |||
| 232 | target = rdev->sb_start + offset + index * (PAGE_SIZE/512); | 232 | target = rdev->sb_start + offset + index * (PAGE_SIZE/512); |
| 233 | 233 | ||
| 234 | if (sync_page_io(rdev->bdev, target, | 234 | if (sync_page_io(rdev->bdev, target, |
| 235 | roundup(size, bdev_hardsect_size(rdev->bdev)), | 235 | roundup(size, bdev_logical_block_size(rdev->bdev)), |
| 236 | page, READ)) { | 236 | page, READ)) { |
| 237 | page->index = index; | 237 | page->index = index; |
| 238 | attach_page_buffers(page, NULL); /* so that free_buffer will | 238 | attach_page_buffers(page, NULL); /* so that free_buffer will |
| @@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) | |||
| 287 | int size = PAGE_SIZE; | 287 | int size = PAGE_SIZE; |
| 288 | if (page->index == bitmap->file_pages-1) | 288 | if (page->index == bitmap->file_pages-1) |
| 289 | size = roundup(bitmap->last_page_size, | 289 | size = roundup(bitmap->last_page_size, |
| 290 | bdev_hardsect_size(rdev->bdev)); | 290 | bdev_logical_block_size(rdev->bdev)); |
| 291 | /* Just make sure we aren't corrupting data or | 291 | /* Just make sure we aren't corrupting data or |
| 292 | * metadata | 292 | * metadata |
| 293 | */ | 293 | */ |
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 53394e863c74..9933eb861c71 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c | |||
| @@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1132 | goto bad_crypt_queue; | 1132 | goto bad_crypt_queue; |
| 1133 | } | 1133 | } |
| 1134 | 1134 | ||
| 1135 | ti->num_flush_requests = 1; | ||
| 1135 | ti->private = cc; | 1136 | ti->private = cc; |
| 1136 | return 0; | 1137 | return 0; |
| 1137 | 1138 | ||
| @@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, | |||
| 1189 | union map_info *map_context) | 1190 | union map_info *map_context) |
| 1190 | { | 1191 | { |
| 1191 | struct dm_crypt_io *io; | 1192 | struct dm_crypt_io *io; |
| 1193 | struct crypt_config *cc; | ||
| 1194 | |||
| 1195 | if (unlikely(bio_empty_barrier(bio))) { | ||
| 1196 | cc = ti->private; | ||
| 1197 | bio->bi_bdev = cc->dev->bdev; | ||
| 1198 | return DM_MAPIO_REMAPPED; | ||
| 1199 | } | ||
| 1192 | 1200 | ||
| 1193 | io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); | 1201 | io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin); |
| 1194 | 1202 | ||
| @@ -1305,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
| 1305 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 1313 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
| 1306 | } | 1314 | } |
| 1307 | 1315 | ||
| 1316 | static int crypt_iterate_devices(struct dm_target *ti, | ||
| 1317 | iterate_devices_callout_fn fn, void *data) | ||
| 1318 | { | ||
| 1319 | struct crypt_config *cc = ti->private; | ||
| 1320 | |||
| 1321 | return fn(ti, cc->dev, cc->start, data); | ||
| 1322 | } | ||
| 1323 | |||
| 1308 | static struct target_type crypt_target = { | 1324 | static struct target_type crypt_target = { |
| 1309 | .name = "crypt", | 1325 | .name = "crypt", |
| 1310 | .version= {1, 6, 0}, | 1326 | .version = {1, 7, 0}, |
| 1311 | .module = THIS_MODULE, | 1327 | .module = THIS_MODULE, |
| 1312 | .ctr = crypt_ctr, | 1328 | .ctr = crypt_ctr, |
| 1313 | .dtr = crypt_dtr, | 1329 | .dtr = crypt_dtr, |
| @@ -1318,6 +1334,7 @@ static struct target_type crypt_target = { | |||
| 1318 | .resume = crypt_resume, | 1334 | .resume = crypt_resume, |
| 1319 | .message = crypt_message, | 1335 | .message = crypt_message, |
| 1320 | .merge = crypt_merge, | 1336 | .merge = crypt_merge, |
| 1337 | .iterate_devices = crypt_iterate_devices, | ||
| 1321 | }; | 1338 | }; |
| 1322 | 1339 | ||
| 1323 | static int __init dm_crypt_init(void) | 1340 | static int __init dm_crypt_init(void) |
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 559dbb52bc85..4e5b843cd4d7 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c | |||
| @@ -197,6 +197,7 @@ out: | |||
| 197 | mutex_init(&dc->timer_lock); | 197 | mutex_init(&dc->timer_lock); |
| 198 | atomic_set(&dc->may_delay, 1); | 198 | atomic_set(&dc->may_delay, 1); |
| 199 | 199 | ||
| 200 | ti->num_flush_requests = 1; | ||
| 200 | ti->private = dc; | 201 | ti->private = dc; |
| 201 | return 0; | 202 | return 0; |
| 202 | 203 | ||
| @@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio, | |||
| 278 | 279 | ||
| 279 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { | 280 | if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { |
| 280 | bio->bi_bdev = dc->dev_write->bdev; | 281 | bio->bi_bdev = dc->dev_write->bdev; |
| 281 | bio->bi_sector = dc->start_write + | 282 | if (bio_sectors(bio)) |
| 282 | (bio->bi_sector - ti->begin); | 283 | bio->bi_sector = dc->start_write + |
| 284 | (bio->bi_sector - ti->begin); | ||
| 283 | 285 | ||
| 284 | return delay_bio(dc, dc->write_delay, bio); | 286 | return delay_bio(dc, dc->write_delay, bio); |
| 285 | } | 287 | } |
| @@ -316,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type, | |||
| 316 | return 0; | 318 | return 0; |
| 317 | } | 319 | } |
| 318 | 320 | ||
| 321 | static int delay_iterate_devices(struct dm_target *ti, | ||
| 322 | iterate_devices_callout_fn fn, void *data) | ||
| 323 | { | ||
| 324 | struct delay_c *dc = ti->private; | ||
| 325 | int ret = 0; | ||
| 326 | |||
| 327 | ret = fn(ti, dc->dev_read, dc->start_read, data); | ||
| 328 | if (ret) | ||
| 329 | goto out; | ||
| 330 | |||
| 331 | if (dc->dev_write) | ||
| 332 | ret = fn(ti, dc->dev_write, dc->start_write, data); | ||
| 333 | |||
| 334 | out: | ||
| 335 | return ret; | ||
| 336 | } | ||
| 337 | |||
| 319 | static struct target_type delay_target = { | 338 | static struct target_type delay_target = { |
| 320 | .name = "delay", | 339 | .name = "delay", |
| 321 | .version = {1, 0, 2}, | 340 | .version = {1, 1, 0}, |
| 322 | .module = THIS_MODULE, | 341 | .module = THIS_MODULE, |
| 323 | .ctr = delay_ctr, | 342 | .ctr = delay_ctr, |
| 324 | .dtr = delay_dtr, | 343 | .dtr = delay_dtr, |
| @@ -326,6 +345,7 @@ static struct target_type delay_target = { | |||
| 326 | .presuspend = delay_presuspend, | 345 | .presuspend = delay_presuspend, |
| 327 | .resume = delay_resume, | 346 | .resume = delay_resume, |
| 328 | .status = delay_status, | 347 | .status = delay_status, |
| 348 | .iterate_devices = delay_iterate_devices, | ||
| 329 | }; | 349 | }; |
| 330 | 350 | ||
| 331 | static int __init dm_delay_init(void) | 351 | static int __init dm_delay_init(void) |
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index a2e26c242141..c3ae51584b12 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c | |||
| @@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store, | |||
| 178 | } | 178 | } |
| 179 | 179 | ||
| 180 | /* Validate the chunk size against the device block size */ | 180 | /* Validate the chunk size against the device block size */ |
| 181 | if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) { | 181 | if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) { |
| 182 | *error = "Chunk size is not a multiple of device blocksize"; | 182 | *error = "Chunk size is not a multiple of device blocksize"; |
| 183 | return -EINVAL; | 183 | return -EINVAL; |
| 184 | } | 184 | } |
| @@ -216,7 +216,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, | |||
| 216 | return -EINVAL; | 216 | return -EINVAL; |
| 217 | } | 217 | } |
| 218 | 218 | ||
| 219 | type = get_type(argv[1]); | 219 | type = get_type(&persistent); |
| 220 | if (!type) { | 220 | if (!type) { |
| 221 | ti->error = "Exception store type not recognised"; | 221 | ti->error = "Exception store type not recognised"; |
| 222 | r = -EINVAL; | 222 | r = -EINVAL; |
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 0a2e6e7f67b3..2442c8c07898 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h | |||
| @@ -111,7 +111,7 @@ struct dm_exception_store { | |||
| 111 | /* | 111 | /* |
| 112 | * Funtions to manipulate consecutive chunks | 112 | * Funtions to manipulate consecutive chunks |
| 113 | */ | 113 | */ |
| 114 | # if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) | 114 | # if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) |
| 115 | # define DM_CHUNK_CONSECUTIVE_BITS 8 | 115 | # define DM_CHUNK_CONSECUTIVE_BITS 8 |
| 116 | # define DM_CHUNK_NUMBER_BITS 56 | 116 | # define DM_CHUNK_NUMBER_BITS 56 |
| 117 | 117 | ||
| @@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) | |||
| 156 | */ | 156 | */ |
| 157 | static inline sector_t get_dev_size(struct block_device *bdev) | 157 | static inline sector_t get_dev_size(struct block_device *bdev) |
| 158 | { | 158 | { |
| 159 | return bdev->bd_inode->i_size >> SECTOR_SHIFT; | 159 | return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; |
| 160 | } | 160 | } |
| 161 | 161 | ||
| 162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, | 162 | static inline chunk_t sector_to_chunk(struct dm_exception_store *store, |
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index e73aabd61cd7..3a2e6a2f8bdd 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
| @@ -22,6 +22,7 @@ struct dm_io_client { | |||
| 22 | /* FIXME: can we shrink this ? */ | 22 | /* FIXME: can we shrink this ? */ |
| 23 | struct io { | 23 | struct io { |
| 24 | unsigned long error_bits; | 24 | unsigned long error_bits; |
| 25 | unsigned long eopnotsupp_bits; | ||
| 25 | atomic_t count; | 26 | atomic_t count; |
| 26 | struct task_struct *sleeper; | 27 | struct task_struct *sleeper; |
| 27 | struct dm_io_client *client; | 28 | struct dm_io_client *client; |
| @@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio) | |||
| 107 | *---------------------------------------------------------------*/ | 108 | *---------------------------------------------------------------*/ |
| 108 | static void dec_count(struct io *io, unsigned int region, int error) | 109 | static void dec_count(struct io *io, unsigned int region, int error) |
| 109 | { | 110 | { |
| 110 | if (error) | 111 | if (error) { |
| 111 | set_bit(region, &io->error_bits); | 112 | set_bit(region, &io->error_bits); |
| 113 | if (error == -EOPNOTSUPP) | ||
| 114 | set_bit(region, &io->eopnotsupp_bits); | ||
| 115 | } | ||
| 112 | 116 | ||
| 113 | if (atomic_dec_and_test(&io->count)) { | 117 | if (atomic_dec_and_test(&io->count)) { |
| 114 | if (io->sleeper) | 118 | if (io->sleeper) |
| @@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
| 360 | return -EIO; | 364 | return -EIO; |
| 361 | } | 365 | } |
| 362 | 366 | ||
| 367 | retry: | ||
| 363 | io.error_bits = 0; | 368 | io.error_bits = 0; |
| 369 | io.eopnotsupp_bits = 0; | ||
| 364 | atomic_set(&io.count, 1); /* see dispatch_io() */ | 370 | atomic_set(&io.count, 1); /* see dispatch_io() */ |
| 365 | io.sleeper = current; | 371 | io.sleeper = current; |
| 366 | io.client = client; | 372 | io.client = client; |
| @@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, | |||
| 377 | } | 383 | } |
| 378 | set_current_state(TASK_RUNNING); | 384 | set_current_state(TASK_RUNNING); |
| 379 | 385 | ||
| 386 | if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) { | ||
| 387 | rw &= ~(1 << BIO_RW_BARRIER); | ||
| 388 | goto retry; | ||
| 389 | } | ||
| 390 | |||
| 380 | if (error_bits) | 391 | if (error_bits) |
| 381 | *error_bits = io.error_bits; | 392 | *error_bits = io.error_bits; |
| 382 | 393 | ||
| @@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, | |||
| 397 | 408 | ||
| 398 | io = mempool_alloc(client->pool, GFP_NOIO); | 409 | io = mempool_alloc(client->pool, GFP_NOIO); |
| 399 | io->error_bits = 0; | 410 | io->error_bits = 0; |
| 411 | io->eopnotsupp_bits = 0; | ||
| 400 | atomic_set(&io->count, 1); /* see dispatch_io() */ | 412 | atomic_set(&io->count, 1); /* see dispatch_io() */ |
| 401 | io->sleeper = NULL; | 413 | io->sleeper = NULL; |
| 402 | io->client = client; | 414 | io->client = client; |
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 823ceba6efa8..7f77f18fcafa 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
| @@ -276,7 +276,7 @@ retry: | |||
| 276 | up_write(&_hash_lock); | 276 | up_write(&_hash_lock); |
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | static int dm_hash_rename(const char *old, const char *new) | 279 | static int dm_hash_rename(uint32_t cookie, const char *old, const char *new) |
| 280 | { | 280 | { |
| 281 | char *new_name, *old_name; | 281 | char *new_name, *old_name; |
| 282 | struct hash_cell *hc; | 282 | struct hash_cell *hc; |
| @@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new) | |||
| 333 | dm_table_put(table); | 333 | dm_table_put(table); |
| 334 | } | 334 | } |
| 335 | 335 | ||
| 336 | dm_kobject_uevent(hc->md); | 336 | dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie); |
| 337 | 337 | ||
| 338 | dm_put(hc->md); | 338 | dm_put(hc->md); |
| 339 | up_write(&_hash_lock); | 339 | up_write(&_hash_lock); |
| @@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) | |||
| 680 | 680 | ||
| 681 | __hash_remove(hc); | 681 | __hash_remove(hc); |
| 682 | up_write(&_hash_lock); | 682 | up_write(&_hash_lock); |
| 683 | |||
| 684 | dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr); | ||
| 685 | |||
| 683 | dm_put(md); | 686 | dm_put(md); |
| 684 | param->data_size = 0; | 687 | param->data_size = 0; |
| 685 | return 0; | 688 | return 0; |
| @@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size) | |||
| 715 | return r; | 718 | return r; |
| 716 | 719 | ||
| 717 | param->data_size = 0; | 720 | param->data_size = 0; |
| 718 | return dm_hash_rename(param->name, new_name); | 721 | return dm_hash_rename(param->event_nr, param->name, new_name); |
| 719 | } | 722 | } |
| 720 | 723 | ||
| 721 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | 724 | static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) |
| @@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param) | |||
| 842 | if (dm_suspended(md)) | 845 | if (dm_suspended(md)) |
| 843 | r = dm_resume(md); | 846 | r = dm_resume(md); |
| 844 | 847 | ||
| 845 | if (!r) | 848 | |
| 849 | if (!r) { | ||
| 850 | dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr); | ||
| 846 | r = __dev_status(md, param); | 851 | r = __dev_status(md, param); |
| 852 | } | ||
| 847 | 853 | ||
| 848 | dm_put(md); | 854 | dm_put(md); |
| 849 | return r; | 855 | return r; |
| @@ -1044,6 +1050,12 @@ static int populate_table(struct dm_table *table, | |||
| 1044 | next = spec->next; | 1050 | next = spec->next; |
| 1045 | } | 1051 | } |
| 1046 | 1052 | ||
| 1053 | r = dm_table_set_type(table); | ||
| 1054 | if (r) { | ||
| 1055 | DMWARN("unable to set table type"); | ||
| 1056 | return r; | ||
| 1057 | } | ||
| 1058 | |||
| 1047 | return dm_table_complete(table); | 1059 | return dm_table_complete(table); |
| 1048 | } | 1060 | } |
| 1049 | 1061 | ||
| @@ -1089,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size) | |||
| 1089 | goto out; | 1101 | goto out; |
| 1090 | } | 1102 | } |
| 1091 | 1103 | ||
| 1104 | r = dm_table_alloc_md_mempools(t); | ||
| 1105 | if (r) { | ||
| 1106 | DMWARN("unable to allocate mempools for this table"); | ||
| 1107 | dm_table_destroy(t); | ||
| 1108 | goto out; | ||
| 1109 | } | ||
| 1110 | |||
| 1092 | down_write(&_hash_lock); | 1111 | down_write(&_hash_lock); |
| 1093 | hc = dm_get_mdptr(md); | 1112 | hc = dm_get_mdptr(md); |
| 1094 | if (!hc || hc->md != md) { | 1113 | if (!hc || hc->md != md) { |
| @@ -1513,6 +1532,7 @@ static const struct file_operations _ctl_fops = { | |||
| 1513 | static struct miscdevice _dm_misc = { | 1532 | static struct miscdevice _dm_misc = { |
| 1514 | .minor = MISC_DYNAMIC_MINOR, | 1533 | .minor = MISC_DYNAMIC_MINOR, |
| 1515 | .name = DM_NAME, | 1534 | .name = DM_NAME, |
| 1535 | .devnode = "mapper/control", | ||
| 1516 | .fops = &_ctl_fops | 1536 | .fops = &_ctl_fops |
| 1517 | }; | 1537 | }; |
| 1518 | 1538 | ||
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 79fb53e51c70..9184b6deb868 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
| @@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 53 | goto bad; | 53 | goto bad; |
| 54 | } | 54 | } |
| 55 | 55 | ||
| 56 | ti->num_flush_requests = 1; | ||
| 56 | ti->private = lc; | 57 | ti->private = lc; |
| 57 | return 0; | 58 | return 0; |
| 58 | 59 | ||
| @@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) | |||
| 81 | struct linear_c *lc = ti->private; | 82 | struct linear_c *lc = ti->private; |
| 82 | 83 | ||
| 83 | bio->bi_bdev = lc->dev->bdev; | 84 | bio->bi_bdev = lc->dev->bdev; |
| 84 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | 85 | if (bio_sectors(bio)) |
| 86 | bio->bi_sector = linear_map_sector(ti, bio->bi_sector); | ||
| 85 | } | 87 | } |
| 86 | 88 | ||
| 87 | static int linear_map(struct dm_target *ti, struct bio *bio, | 89 | static int linear_map(struct dm_target *ti, struct bio *bio, |
| @@ -132,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
| 132 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 134 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
| 133 | } | 135 | } |
| 134 | 136 | ||
| 137 | static int linear_iterate_devices(struct dm_target *ti, | ||
| 138 | iterate_devices_callout_fn fn, void *data) | ||
| 139 | { | ||
| 140 | struct linear_c *lc = ti->private; | ||
| 141 | |||
| 142 | return fn(ti, lc->dev, lc->start, data); | ||
| 143 | } | ||
| 144 | |||
| 135 | static struct target_type linear_target = { | 145 | static struct target_type linear_target = { |
| 136 | .name = "linear", | 146 | .name = "linear", |
| 137 | .version= {1, 0, 3}, | 147 | .version = {1, 1, 0}, |
| 138 | .module = THIS_MODULE, | 148 | .module = THIS_MODULE, |
| 139 | .ctr = linear_ctr, | 149 | .ctr = linear_ctr, |
| 140 | .dtr = linear_dtr, | 150 | .dtr = linear_dtr, |
| @@ -142,6 +152,7 @@ static struct target_type linear_target = { | |||
| 142 | .status = linear_status, | 152 | .status = linear_status, |
| 143 | .ioctl = linear_ioctl, | 153 | .ioctl = linear_ioctl, |
| 144 | .merge = linear_merge, | 154 | .merge = linear_merge, |
| 155 | .iterate_devices = linear_iterate_devices, | ||
| 145 | }; | 156 | }; |
| 146 | 157 | ||
| 147 | int __init dm_linear_init(void) | 158 | int __init dm_linear_init(void) |
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 000000000000..e69b96560997 --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c | |||
| @@ -0,0 +1,696 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the LGPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/bio.h> | ||
| 8 | #include <linux/dm-dirty-log.h> | ||
| 9 | #include <linux/device-mapper.h> | ||
| 10 | #include <linux/dm-log-userspace.h> | ||
| 11 | |||
| 12 | #include "dm-log-userspace-transfer.h" | ||
| 13 | |||
| 14 | struct flush_entry { | ||
| 15 | int type; | ||
| 16 | region_t region; | ||
| 17 | struct list_head list; | ||
| 18 | }; | ||
| 19 | |||
| 20 | struct log_c { | ||
| 21 | struct dm_target *ti; | ||
| 22 | uint32_t region_size; | ||
| 23 | region_t region_count; | ||
| 24 | char uuid[DM_UUID_LEN]; | ||
| 25 | |||
| 26 | char *usr_argv_str; | ||
| 27 | uint32_t usr_argc; | ||
| 28 | |||
| 29 | /* | ||
| 30 | * in_sync_hint gets set when doing is_remote_recovering. It | ||
| 31 | * represents the first region that needs recovery. IOW, the | ||
| 32 | * first zero bit of sync_bits. This can be useful for to limit | ||
| 33 | * traffic for calls like is_remote_recovering and get_resync_work, | ||
| 34 | * but be take care in its use for anything else. | ||
| 35 | */ | ||
| 36 | uint64_t in_sync_hint; | ||
| 37 | |||
| 38 | spinlock_t flush_lock; | ||
| 39 | struct list_head flush_list; /* only for clear and mark requests */ | ||
| 40 | }; | ||
| 41 | |||
| 42 | static mempool_t *flush_entry_pool; | ||
| 43 | |||
| 44 | static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) | ||
| 45 | { | ||
| 46 | return kmalloc(sizeof(struct flush_entry), gfp_mask); | ||
| 47 | } | ||
| 48 | |||
| 49 | static void flush_entry_free(void *element, void *pool_data) | ||
| 50 | { | ||
| 51 | kfree(element); | ||
| 52 | } | ||
| 53 | |||
| 54 | static int userspace_do_request(struct log_c *lc, const char *uuid, | ||
| 55 | int request_type, char *data, size_t data_size, | ||
| 56 | char *rdata, size_t *rdata_size) | ||
| 57 | { | ||
| 58 | int r; | ||
| 59 | |||
| 60 | /* | ||
| 61 | * If the server isn't there, -ESRCH is returned, | ||
| 62 | * and we must keep trying until the server is | ||
| 63 | * restored. | ||
| 64 | */ | ||
| 65 | retry: | ||
| 66 | r = dm_consult_userspace(uuid, request_type, data, | ||
| 67 | data_size, rdata, rdata_size); | ||
| 68 | |||
| 69 | if (r != -ESRCH) | ||
| 70 | return r; | ||
| 71 | |||
| 72 | DMERR(" Userspace log server not found."); | ||
| 73 | while (1) { | ||
| 74 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 75 | schedule_timeout(2*HZ); | ||
| 76 | DMWARN("Attempting to contact userspace log server..."); | ||
| 77 | r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str, | ||
| 78 | strlen(lc->usr_argv_str) + 1, | ||
| 79 | NULL, NULL); | ||
| 80 | if (!r) | ||
| 81 | break; | ||
| 82 | } | ||
| 83 | DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); | ||
| 84 | r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL, | ||
| 85 | 0, NULL, NULL); | ||
| 86 | if (!r) | ||
| 87 | goto retry; | ||
| 88 | |||
| 89 | DMERR("Error trying to resume userspace log: %d", r); | ||
| 90 | |||
| 91 | return -ESRCH; | ||
| 92 | } | ||
| 93 | |||
| 94 | static int build_constructor_string(struct dm_target *ti, | ||
| 95 | unsigned argc, char **argv, | ||
| 96 | char **ctr_str) | ||
| 97 | { | ||
| 98 | int i, str_size; | ||
| 99 | char *str = NULL; | ||
| 100 | |||
| 101 | *ctr_str = NULL; | ||
| 102 | |||
| 103 | for (i = 0, str_size = 0; i < argc; i++) | ||
| 104 | str_size += strlen(argv[i]) + 1; /* +1 for space between args */ | ||
| 105 | |||
| 106 | str_size += 20; /* Max number of chars in a printed u64 number */ | ||
| 107 | |||
| 108 | str = kzalloc(str_size, GFP_KERNEL); | ||
| 109 | if (!str) { | ||
| 110 | DMWARN("Unable to allocate memory for constructor string"); | ||
| 111 | return -ENOMEM; | ||
| 112 | } | ||
| 113 | |||
| 114 | for (i = 0, str_size = 0; i < argc; i++) | ||
| 115 | str_size += sprintf(str + str_size, "%s ", argv[i]); | ||
| 116 | str_size += sprintf(str + str_size, "%llu", | ||
| 117 | (unsigned long long)ti->len); | ||
| 118 | |||
| 119 | *ctr_str = str; | ||
| 120 | return str_size; | ||
| 121 | } | ||
| 122 | |||
| 123 | /* | ||
| 124 | * userspace_ctr | ||
| 125 | * | ||
| 126 | * argv contains: | ||
| 127 | * <UUID> <other args> | ||
| 128 | * Where 'other args' is the userspace implementation specific log | ||
| 129 | * arguments. An example might be: | ||
| 130 | * <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync] | ||
| 131 | * | ||
| 132 | * So, this module will strip off the <UUID> for identification purposes | ||
| 133 | * when communicating with userspace about a log; but will pass on everything | ||
| 134 | * else. | ||
| 135 | */ | ||
| 136 | static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, | ||
| 137 | unsigned argc, char **argv) | ||
| 138 | { | ||
| 139 | int r = 0; | ||
| 140 | int str_size; | ||
| 141 | char *ctr_str = NULL; | ||
| 142 | struct log_c *lc = NULL; | ||
| 143 | uint64_t rdata; | ||
| 144 | size_t rdata_size = sizeof(rdata); | ||
| 145 | |||
| 146 | if (argc < 3) { | ||
| 147 | DMWARN("Too few arguments to userspace dirty log"); | ||
| 148 | return -EINVAL; | ||
| 149 | } | ||
| 150 | |||
| 151 | lc = kmalloc(sizeof(*lc), GFP_KERNEL); | ||
| 152 | if (!lc) { | ||
| 153 | DMWARN("Unable to allocate userspace log context."); | ||
| 154 | return -ENOMEM; | ||
| 155 | } | ||
| 156 | |||
| 157 | lc->ti = ti; | ||
| 158 | |||
| 159 | if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { | ||
| 160 | DMWARN("UUID argument too long."); | ||
| 161 | kfree(lc); | ||
| 162 | return -EINVAL; | ||
| 163 | } | ||
| 164 | |||
| 165 | strncpy(lc->uuid, argv[0], DM_UUID_LEN); | ||
| 166 | spin_lock_init(&lc->flush_lock); | ||
| 167 | INIT_LIST_HEAD(&lc->flush_list); | ||
| 168 | |||
| 169 | str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); | ||
| 170 | if (str_size < 0) { | ||
| 171 | kfree(lc); | ||
| 172 | return str_size; | ||
| 173 | } | ||
| 174 | |||
| 175 | /* Send table string */ | ||
| 176 | r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR, | ||
| 177 | ctr_str, str_size, NULL, NULL); | ||
| 178 | |||
| 179 | if (r == -ESRCH) { | ||
| 180 | DMERR("Userspace log server not found"); | ||
| 181 | goto out; | ||
| 182 | } | ||
| 183 | |||
| 184 | /* Since the region size does not change, get it now */ | ||
| 185 | rdata_size = sizeof(rdata); | ||
| 186 | r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE, | ||
| 187 | NULL, 0, (char *)&rdata, &rdata_size); | ||
| 188 | |||
| 189 | if (r) { | ||
| 190 | DMERR("Failed to get region size of dirty log"); | ||
| 191 | goto out; | ||
| 192 | } | ||
| 193 | |||
| 194 | lc->region_size = (uint32_t)rdata; | ||
| 195 | lc->region_count = dm_sector_div_up(ti->len, lc->region_size); | ||
| 196 | |||
| 197 | out: | ||
| 198 | if (r) { | ||
| 199 | kfree(lc); | ||
| 200 | kfree(ctr_str); | ||
| 201 | } else { | ||
| 202 | lc->usr_argv_str = ctr_str; | ||
| 203 | lc->usr_argc = argc; | ||
| 204 | log->context = lc; | ||
| 205 | } | ||
| 206 | |||
| 207 | return r; | ||
| 208 | } | ||
| 209 | |||
| 210 | static void userspace_dtr(struct dm_dirty_log *log) | ||
| 211 | { | ||
| 212 | int r; | ||
| 213 | struct log_c *lc = log->context; | ||
| 214 | |||
| 215 | r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR, | ||
| 216 | NULL, 0, | ||
| 217 | NULL, NULL); | ||
| 218 | |||
| 219 | kfree(lc->usr_argv_str); | ||
| 220 | kfree(lc); | ||
| 221 | |||
| 222 | return; | ||
| 223 | } | ||
| 224 | |||
| 225 | static int userspace_presuspend(struct dm_dirty_log *log) | ||
| 226 | { | ||
| 227 | int r; | ||
| 228 | struct log_c *lc = log->context; | ||
| 229 | |||
| 230 | r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND, | ||
| 231 | NULL, 0, | ||
| 232 | NULL, NULL); | ||
| 233 | |||
| 234 | return r; | ||
| 235 | } | ||
| 236 | |||
| 237 | static int userspace_postsuspend(struct dm_dirty_log *log) | ||
| 238 | { | ||
| 239 | int r; | ||
| 240 | struct log_c *lc = log->context; | ||
| 241 | |||
| 242 | r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND, | ||
| 243 | NULL, 0, | ||
| 244 | NULL, NULL); | ||
| 245 | |||
| 246 | return r; | ||
| 247 | } | ||
| 248 | |||
| 249 | static int userspace_resume(struct dm_dirty_log *log) | ||
| 250 | { | ||
| 251 | int r; | ||
| 252 | struct log_c *lc = log->context; | ||
| 253 | |||
| 254 | lc->in_sync_hint = 0; | ||
| 255 | r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME, | ||
| 256 | NULL, 0, | ||
| 257 | NULL, NULL); | ||
| 258 | |||
| 259 | return r; | ||
| 260 | } | ||
| 261 | |||
| 262 | static uint32_t userspace_get_region_size(struct dm_dirty_log *log) | ||
| 263 | { | ||
| 264 | struct log_c *lc = log->context; | ||
| 265 | |||
| 266 | return lc->region_size; | ||
| 267 | } | ||
| 268 | |||
| 269 | /* | ||
| 270 | * userspace_is_clean | ||
| 271 | * | ||
| 272 | * Check whether a region is clean. If there is any sort of | ||
| 273 | * failure when consulting the server, we return not clean. | ||
| 274 | * | ||
| 275 | * Returns: 1 if clean, 0 otherwise | ||
| 276 | */ | ||
| 277 | static int userspace_is_clean(struct dm_dirty_log *log, region_t region) | ||
| 278 | { | ||
| 279 | int r; | ||
| 280 | uint64_t region64 = (uint64_t)region; | ||
| 281 | int64_t is_clean; | ||
| 282 | size_t rdata_size; | ||
| 283 | struct log_c *lc = log->context; | ||
| 284 | |||
| 285 | rdata_size = sizeof(is_clean); | ||
| 286 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, | ||
| 287 | (char *)®ion64, sizeof(region64), | ||
| 288 | (char *)&is_clean, &rdata_size); | ||
| 289 | |||
| 290 | return (r) ? 0 : (int)is_clean; | ||
| 291 | } | ||
| 292 | |||
| 293 | /* | ||
| 294 | * userspace_in_sync | ||
| 295 | * | ||
| 296 | * Check if the region is in-sync. If there is any sort | ||
| 297 | * of failure when consulting the server, we assume that | ||
| 298 | * the region is not in sync. | ||
| 299 | * | ||
| 300 | * If 'can_block' is set, return immediately | ||
| 301 | * | ||
| 302 | * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK | ||
| 303 | */ | ||
| 304 | static int userspace_in_sync(struct dm_dirty_log *log, region_t region, | ||
| 305 | int can_block) | ||
| 306 | { | ||
| 307 | int r; | ||
| 308 | uint64_t region64 = region; | ||
| 309 | int64_t in_sync; | ||
| 310 | size_t rdata_size; | ||
| 311 | struct log_c *lc = log->context; | ||
| 312 | |||
| 313 | /* | ||
| 314 | * We can never respond directly - even if in_sync_hint is | ||
| 315 | * set. This is because another machine could see a device | ||
| 316 | * failure and mark the region out-of-sync. If we don't go | ||
| 317 | * to userspace to ask, we might think the region is in-sync | ||
| 318 | * and allow a read to pick up data that is stale. (This is | ||
| 319 | * very unlikely if a device actually fails; but it is very | ||
| 320 | * likely if a connection to one device from one machine fails.) | ||
| 321 | * | ||
| 322 | * There still might be a problem if the mirror caches the region | ||
| 323 | * state as in-sync... but then this call would not be made. So, | ||
| 324 | * that is a mirror problem. | ||
| 325 | */ | ||
| 326 | if (!can_block) | ||
| 327 | return -EWOULDBLOCK; | ||
| 328 | |||
| 329 | rdata_size = sizeof(in_sync); | ||
| 330 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, | ||
| 331 | (char *)®ion64, sizeof(region64), | ||
| 332 | (char *)&in_sync, &rdata_size); | ||
| 333 | return (r) ? 0 : (int)in_sync; | ||
| 334 | } | ||
| 335 | |||
| 336 | /* | ||
| 337 | * userspace_flush | ||
| 338 | * | ||
| 339 | * This function is ok to block. | ||
| 340 | * The flush happens in two stages. First, it sends all | ||
| 341 | * clear/mark requests that are on the list. Then it | ||
| 342 | * tells the server to commit them. This gives the | ||
| 343 | * server a chance to optimise the commit, instead of | ||
| 344 | * doing it for every request. | ||
| 345 | * | ||
| 346 | * Additionally, we could implement another thread that | ||
| 347 | * sends the requests up to the server - reducing the | ||
| 348 | * load on flush. Then the flush would have less in | ||
| 349 | * the list and be responsible for the finishing commit. | ||
| 350 | * | ||
| 351 | * Returns: 0 on success, < 0 on failure | ||
| 352 | */ | ||
| 353 | static int userspace_flush(struct dm_dirty_log *log) | ||
| 354 | { | ||
| 355 | int r = 0; | ||
| 356 | unsigned long flags; | ||
| 357 | struct log_c *lc = log->context; | ||
| 358 | LIST_HEAD(flush_list); | ||
| 359 | struct flush_entry *fe, *tmp_fe; | ||
| 360 | |||
| 361 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
| 362 | list_splice_init(&lc->flush_list, &flush_list); | ||
| 363 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
| 364 | |||
| 365 | if (list_empty(&flush_list)) | ||
| 366 | return 0; | ||
| 367 | |||
| 368 | /* | ||
| 369 | * FIXME: Count up requests, group request types, | ||
| 370 | * allocate memory to stick all requests in and | ||
| 371 | * send to server in one go. Failing the allocation, | ||
| 372 | * do it one by one. | ||
| 373 | */ | ||
| 374 | |||
| 375 | list_for_each_entry(fe, &flush_list, list) { | ||
| 376 | r = userspace_do_request(lc, lc->uuid, fe->type, | ||
| 377 | (char *)&fe->region, | ||
| 378 | sizeof(fe->region), | ||
| 379 | NULL, NULL); | ||
| 380 | if (r) | ||
| 381 | goto fail; | ||
| 382 | } | ||
| 383 | |||
| 384 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, | ||
| 385 | NULL, 0, NULL, NULL); | ||
| 386 | |||
| 387 | fail: | ||
| 388 | /* | ||
| 389 | * We can safely remove these entries, even if failure. | ||
| 390 | * Calling code will receive an error and will know that | ||
| 391 | * the log facility has failed. | ||
| 392 | */ | ||
| 393 | list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) { | ||
| 394 | list_del(&fe->list); | ||
| 395 | mempool_free(fe, flush_entry_pool); | ||
| 396 | } | ||
| 397 | |||
| 398 | if (r) | ||
| 399 | dm_table_event(lc->ti->table); | ||
| 400 | |||
| 401 | return r; | ||
| 402 | } | ||
| 403 | |||
| 404 | /* | ||
| 405 | * userspace_mark_region | ||
| 406 | * | ||
| 407 | * This function should avoid blocking unless absolutely required. | ||
| 408 | * (Memory allocation is valid for blocking.) | ||
| 409 | */ | ||
| 410 | static void userspace_mark_region(struct dm_dirty_log *log, region_t region) | ||
| 411 | { | ||
| 412 | unsigned long flags; | ||
| 413 | struct log_c *lc = log->context; | ||
| 414 | struct flush_entry *fe; | ||
| 415 | |||
| 416 | /* Wait for an allocation, but _never_ fail */ | ||
| 417 | fe = mempool_alloc(flush_entry_pool, GFP_NOIO); | ||
| 418 | BUG_ON(!fe); | ||
| 419 | |||
| 420 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
| 421 | fe->type = DM_ULOG_MARK_REGION; | ||
| 422 | fe->region = region; | ||
| 423 | list_add(&fe->list, &lc->flush_list); | ||
| 424 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
| 425 | |||
| 426 | return; | ||
| 427 | } | ||
| 428 | |||
| 429 | /* | ||
| 430 | * userspace_clear_region | ||
| 431 | * | ||
| 432 | * This function must not block. | ||
| 433 | * So, the alloc can't block. In the worst case, it is ok to | ||
| 434 | * fail. It would simply mean we can't clear the region. | ||
| 435 | * Does nothing to current sync context, but does mean | ||
| 436 | * the region will be re-sync'ed on a reload of the mirror | ||
| 437 | * even though it is in-sync. | ||
| 438 | */ | ||
| 439 | static void userspace_clear_region(struct dm_dirty_log *log, region_t region) | ||
| 440 | { | ||
| 441 | unsigned long flags; | ||
| 442 | struct log_c *lc = log->context; | ||
| 443 | struct flush_entry *fe; | ||
| 444 | |||
| 445 | /* | ||
| 446 | * If we fail to allocate, we skip the clearing of | ||
| 447 | * the region. This doesn't hurt us in any way, except | ||
| 448 | * to cause the region to be resync'ed when the | ||
| 449 | * device is activated next time. | ||
| 450 | */ | ||
| 451 | fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); | ||
| 452 | if (!fe) { | ||
| 453 | DMERR("Failed to allocate memory to clear region."); | ||
| 454 | return; | ||
| 455 | } | ||
| 456 | |||
| 457 | spin_lock_irqsave(&lc->flush_lock, flags); | ||
| 458 | fe->type = DM_ULOG_CLEAR_REGION; | ||
| 459 | fe->region = region; | ||
| 460 | list_add(&fe->list, &lc->flush_list); | ||
| 461 | spin_unlock_irqrestore(&lc->flush_lock, flags); | ||
| 462 | |||
| 463 | return; | ||
| 464 | } | ||
| 465 | |||
| 466 | /* | ||
| 467 | * userspace_get_resync_work | ||
| 468 | * | ||
| 469 | * Get a region that needs recovery. It is valid to return | ||
| 470 | * an error for this function. | ||
| 471 | * | ||
| 472 | * Returns: 1 if region filled, 0 if no work, <0 on error | ||
| 473 | */ | ||
| 474 | static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) | ||
| 475 | { | ||
| 476 | int r; | ||
| 477 | size_t rdata_size; | ||
| 478 | struct log_c *lc = log->context; | ||
| 479 | struct { | ||
| 480 | int64_t i; /* 64-bit for mix arch compatibility */ | ||
| 481 | region_t r; | ||
| 482 | } pkg; | ||
| 483 | |||
| 484 | if (lc->in_sync_hint >= lc->region_count) | ||
| 485 | return 0; | ||
| 486 | |||
| 487 | rdata_size = sizeof(pkg); | ||
| 488 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, | ||
| 489 | NULL, 0, | ||
| 490 | (char *)&pkg, &rdata_size); | ||
| 491 | |||
| 492 | *region = pkg.r; | ||
| 493 | return (r) ? r : (int)pkg.i; | ||
| 494 | } | ||
| 495 | |||
| 496 | /* | ||
| 497 | * userspace_set_region_sync | ||
| 498 | * | ||
| 499 | * Set the sync status of a given region. This function | ||
| 500 | * must not fail. | ||
| 501 | */ | ||
| 502 | static void userspace_set_region_sync(struct dm_dirty_log *log, | ||
| 503 | region_t region, int in_sync) | ||
| 504 | { | ||
| 505 | int r; | ||
| 506 | struct log_c *lc = log->context; | ||
| 507 | struct { | ||
| 508 | region_t r; | ||
| 509 | int64_t i; | ||
| 510 | } pkg; | ||
| 511 | |||
| 512 | pkg.r = region; | ||
| 513 | pkg.i = (int64_t)in_sync; | ||
| 514 | |||
| 515 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, | ||
| 516 | (char *)&pkg, sizeof(pkg), | ||
| 517 | NULL, NULL); | ||
| 518 | |||
| 519 | /* | ||
| 520 | * It would be nice to be able to report failures. | ||
| 521 | * However, it is easy emough to detect and resolve. | ||
| 522 | */ | ||
| 523 | return; | ||
| 524 | } | ||
| 525 | |||
| 526 | /* | ||
| 527 | * userspace_get_sync_count | ||
| 528 | * | ||
| 529 | * If there is any sort of failure when consulting the server, | ||
| 530 | * we assume that the sync count is zero. | ||
| 531 | * | ||
| 532 | * Returns: sync count on success, 0 on failure | ||
| 533 | */ | ||
| 534 | static region_t userspace_get_sync_count(struct dm_dirty_log *log) | ||
| 535 | { | ||
| 536 | int r; | ||
| 537 | size_t rdata_size; | ||
| 538 | uint64_t sync_count; | ||
| 539 | struct log_c *lc = log->context; | ||
| 540 | |||
| 541 | rdata_size = sizeof(sync_count); | ||
| 542 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, | ||
| 543 | NULL, 0, | ||
| 544 | (char *)&sync_count, &rdata_size); | ||
| 545 | |||
| 546 | if (r) | ||
| 547 | return 0; | ||
| 548 | |||
| 549 | if (sync_count >= lc->region_count) | ||
| 550 | lc->in_sync_hint = lc->region_count; | ||
| 551 | |||
| 552 | return (region_t)sync_count; | ||
| 553 | } | ||
| 554 | |||
| 555 | /* | ||
| 556 | * userspace_status | ||
| 557 | * | ||
| 558 | * Returns: amount of space consumed | ||
| 559 | */ | ||
| 560 | static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, | ||
| 561 | char *result, unsigned maxlen) | ||
| 562 | { | ||
| 563 | int r = 0; | ||
| 564 | size_t sz = (size_t)maxlen; | ||
| 565 | struct log_c *lc = log->context; | ||
| 566 | |||
| 567 | switch (status_type) { | ||
| 568 | case STATUSTYPE_INFO: | ||
| 569 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, | ||
| 570 | NULL, 0, | ||
| 571 | result, &sz); | ||
| 572 | |||
| 573 | if (r) { | ||
| 574 | sz = 0; | ||
| 575 | DMEMIT("%s 1 COM_FAILURE", log->type->name); | ||
| 576 | } | ||
| 577 | break; | ||
| 578 | case STATUSTYPE_TABLE: | ||
| 579 | sz = 0; | ||
| 580 | DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1, | ||
| 581 | lc->uuid, lc->usr_argv_str); | ||
| 582 | break; | ||
| 583 | } | ||
| 584 | return (r) ? 0 : (int)sz; | ||
| 585 | } | ||
| 586 | |||
| 587 | /* | ||
| 588 | * userspace_is_remote_recovering | ||
| 589 | * | ||
| 590 | * Returns: 1 if region recovering, 0 otherwise | ||
| 591 | */ | ||
| 592 | static int userspace_is_remote_recovering(struct dm_dirty_log *log, | ||
| 593 | region_t region) | ||
| 594 | { | ||
| 595 | int r; | ||
| 596 | uint64_t region64 = region; | ||
| 597 | struct log_c *lc = log->context; | ||
| 598 | static unsigned long long limit; | ||
| 599 | struct { | ||
| 600 | int64_t is_recovering; | ||
| 601 | uint64_t in_sync_hint; | ||
| 602 | } pkg; | ||
| 603 | size_t rdata_size = sizeof(pkg); | ||
| 604 | |||
| 605 | /* | ||
| 606 | * Once the mirror has been reported to be in-sync, | ||
| 607 | * it will never again ask for recovery work. So, | ||
| 608 | * we can safely say there is not a remote machine | ||
| 609 | * recovering if the device is in-sync. (in_sync_hint | ||
| 610 | * must be reset at resume time.) | ||
| 611 | */ | ||
| 612 | if (region < lc->in_sync_hint) | ||
| 613 | return 0; | ||
| 614 | else if (jiffies < limit) | ||
| 615 | return 1; | ||
| 616 | |||
| 617 | limit = jiffies + (HZ / 4); | ||
| 618 | r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, | ||
| 619 | (char *)®ion64, sizeof(region64), | ||
| 620 | (char *)&pkg, &rdata_size); | ||
| 621 | if (r) | ||
| 622 | return 1; | ||
| 623 | |||
| 624 | lc->in_sync_hint = pkg.in_sync_hint; | ||
| 625 | |||
| 626 | return (int)pkg.is_recovering; | ||
| 627 | } | ||
| 628 | |||
| 629 | static struct dm_dirty_log_type _userspace_type = { | ||
| 630 | .name = "userspace", | ||
| 631 | .module = THIS_MODULE, | ||
| 632 | .ctr = userspace_ctr, | ||
| 633 | .dtr = userspace_dtr, | ||
| 634 | .presuspend = userspace_presuspend, | ||
| 635 | .postsuspend = userspace_postsuspend, | ||
| 636 | .resume = userspace_resume, | ||
| 637 | .get_region_size = userspace_get_region_size, | ||
| 638 | .is_clean = userspace_is_clean, | ||
| 639 | .in_sync = userspace_in_sync, | ||
| 640 | .flush = userspace_flush, | ||
| 641 | .mark_region = userspace_mark_region, | ||
| 642 | .clear_region = userspace_clear_region, | ||
| 643 | .get_resync_work = userspace_get_resync_work, | ||
| 644 | .set_region_sync = userspace_set_region_sync, | ||
| 645 | .get_sync_count = userspace_get_sync_count, | ||
| 646 | .status = userspace_status, | ||
| 647 | .is_remote_recovering = userspace_is_remote_recovering, | ||
| 648 | }; | ||
| 649 | |||
| 650 | static int __init userspace_dirty_log_init(void) | ||
| 651 | { | ||
| 652 | int r = 0; | ||
| 653 | |||
| 654 | flush_entry_pool = mempool_create(100, flush_entry_alloc, | ||
| 655 | flush_entry_free, NULL); | ||
| 656 | |||
| 657 | if (!flush_entry_pool) { | ||
| 658 | DMWARN("Unable to create flush_entry_pool: No memory."); | ||
| 659 | return -ENOMEM; | ||
| 660 | } | ||
| 661 | |||
| 662 | r = dm_ulog_tfr_init(); | ||
| 663 | if (r) { | ||
| 664 | DMWARN("Unable to initialize userspace log communications"); | ||
| 665 | mempool_destroy(flush_entry_pool); | ||
| 666 | return r; | ||
| 667 | } | ||
| 668 | |||
| 669 | r = dm_dirty_log_type_register(&_userspace_type); | ||
| 670 | if (r) { | ||
| 671 | DMWARN("Couldn't register userspace dirty log type"); | ||
| 672 | dm_ulog_tfr_exit(); | ||
| 673 | mempool_destroy(flush_entry_pool); | ||
| 674 | return r; | ||
| 675 | } | ||
| 676 | |||
| 677 | DMINFO("version 1.0.0 loaded"); | ||
| 678 | return 0; | ||
| 679 | } | ||
| 680 | |||
| 681 | static void __exit userspace_dirty_log_exit(void) | ||
| 682 | { | ||
| 683 | dm_dirty_log_type_unregister(&_userspace_type); | ||
| 684 | dm_ulog_tfr_exit(); | ||
| 685 | mempool_destroy(flush_entry_pool); | ||
| 686 | |||
| 687 | DMINFO("version 1.0.0 unloaded"); | ||
| 688 | return; | ||
| 689 | } | ||
| 690 | |||
| 691 | module_init(userspace_dirty_log_init); | ||
| 692 | module_exit(userspace_dirty_log_exit); | ||
| 693 | |||
| 694 | MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); | ||
| 695 | MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); | ||
| 696 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c new file mode 100644 index 000000000000..0ca1ee768a1f --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.c | |||
| @@ -0,0 +1,276 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the LGPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/kernel.h> | ||
| 8 | #include <linux/module.h> | ||
| 9 | #include <net/sock.h> | ||
| 10 | #include <linux/workqueue.h> | ||
| 11 | #include <linux/connector.h> | ||
| 12 | #include <linux/device-mapper.h> | ||
| 13 | #include <linux/dm-log-userspace.h> | ||
| 14 | |||
| 15 | #include "dm-log-userspace-transfer.h" | ||
| 16 | |||
| 17 | static uint32_t dm_ulog_seq; | ||
| 18 | |||
| 19 | /* | ||
| 20 | * Netlink/Connector is an unreliable protocol. How long should | ||
| 21 | * we wait for a response before assuming it was lost and retrying? | ||
| 22 | * (If we do receive a response after this time, it will be discarded | ||
| 23 | * and the response to the resent request will be waited for. | ||
| 24 | */ | ||
| 25 | #define DM_ULOG_RETRY_TIMEOUT (15 * HZ) | ||
| 26 | |||
| 27 | /* | ||
| 28 | * Pre-allocated space for speed | ||
| 29 | */ | ||
| 30 | #define DM_ULOG_PREALLOCED_SIZE 512 | ||
| 31 | static struct cn_msg *prealloced_cn_msg; | ||
| 32 | static struct dm_ulog_request *prealloced_ulog_tfr; | ||
| 33 | |||
| 34 | static struct cb_id ulog_cn_id = { | ||
| 35 | .idx = CN_IDX_DM, | ||
| 36 | .val = CN_VAL_DM_USERSPACE_LOG | ||
| 37 | }; | ||
| 38 | |||
| 39 | static DEFINE_MUTEX(dm_ulog_lock); | ||
| 40 | |||
| 41 | struct receiving_pkg { | ||
| 42 | struct list_head list; | ||
| 43 | struct completion complete; | ||
| 44 | |||
| 45 | uint32_t seq; | ||
| 46 | |||
| 47 | int error; | ||
| 48 | size_t *data_size; | ||
| 49 | char *data; | ||
| 50 | }; | ||
| 51 | |||
| 52 | static DEFINE_SPINLOCK(receiving_list_lock); | ||
| 53 | static struct list_head receiving_list; | ||
| 54 | |||
| 55 | static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) | ||
| 56 | { | ||
| 57 | int r; | ||
| 58 | struct cn_msg *msg = prealloced_cn_msg; | ||
| 59 | |||
| 60 | memset(msg, 0, sizeof(struct cn_msg)); | ||
| 61 | |||
| 62 | msg->id.idx = ulog_cn_id.idx; | ||
| 63 | msg->id.val = ulog_cn_id.val; | ||
| 64 | msg->ack = 0; | ||
| 65 | msg->seq = tfr->seq; | ||
| 66 | msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; | ||
| 67 | |||
| 68 | r = cn_netlink_send(msg, 0, gfp_any()); | ||
| 69 | |||
| 70 | return r; | ||
| 71 | } | ||
| 72 | |||
| 73 | /* | ||
| 74 | * Parameters for this function can be either msg or tfr, but not | ||
| 75 | * both. This function fills in the reply for a waiting request. | ||
| 76 | * If just msg is given, then the reply is simply an ACK from userspace | ||
| 77 | * that the request was received. | ||
| 78 | * | ||
| 79 | * Returns: 0 on success, -ENOENT on failure | ||
| 80 | */ | ||
| 81 | static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) | ||
| 82 | { | ||
| 83 | uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; | ||
| 84 | struct receiving_pkg *pkg; | ||
| 85 | |||
| 86 | /* | ||
| 87 | * The 'receiving_pkg' entries in this list are statically | ||
| 88 | * allocated on the stack in 'dm_consult_userspace'. | ||
| 89 | * Each process that is waiting for a reply from the user | ||
| 90 | * space server will have an entry in this list. | ||
| 91 | * | ||
| 92 | * We are safe to do it this way because the stack space | ||
| 93 | * is unique to each process, but still addressable by | ||
| 94 | * other processes. | ||
| 95 | */ | ||
| 96 | list_for_each_entry(pkg, &receiving_list, list) { | ||
| 97 | if (rtn_seq != pkg->seq) | ||
| 98 | continue; | ||
| 99 | |||
| 100 | if (msg) { | ||
| 101 | pkg->error = -msg->ack; | ||
| 102 | /* | ||
| 103 | * If we are trying again, we will need to know our | ||
| 104 | * storage capacity. Otherwise, along with the | ||
| 105 | * error code, we make explicit that we have no data. | ||
| 106 | */ | ||
| 107 | if (pkg->error != -EAGAIN) | ||
| 108 | *(pkg->data_size) = 0; | ||
| 109 | } else if (tfr->data_size > *(pkg->data_size)) { | ||
| 110 | DMERR("Insufficient space to receive package [%u] " | ||
| 111 | "(%u vs %lu)", tfr->request_type, | ||
| 112 | tfr->data_size, *(pkg->data_size)); | ||
| 113 | |||
| 114 | *(pkg->data_size) = 0; | ||
| 115 | pkg->error = -ENOSPC; | ||
| 116 | } else { | ||
| 117 | pkg->error = tfr->error; | ||
| 118 | memcpy(pkg->data, tfr->data, tfr->data_size); | ||
| 119 | *(pkg->data_size) = tfr->data_size; | ||
| 120 | } | ||
| 121 | complete(&pkg->complete); | ||
| 122 | return 0; | ||
| 123 | } | ||
| 124 | |||
| 125 | return -ENOENT; | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | ||
| 129 | * This is the connector callback that delivers data | ||
| 130 | * that was sent from userspace. | ||
| 131 | */ | ||
| 132 | static void cn_ulog_callback(void *data) | ||
| 133 | { | ||
| 134 | struct cn_msg *msg = (struct cn_msg *)data; | ||
| 135 | struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); | ||
| 136 | |||
| 137 | spin_lock(&receiving_list_lock); | ||
| 138 | if (msg->len == 0) | ||
| 139 | fill_pkg(msg, NULL); | ||
| 140 | else if (msg->len < sizeof(*tfr)) | ||
| 141 | DMERR("Incomplete message received (expected %u, got %u): [%u]", | ||
| 142 | (unsigned)sizeof(*tfr), msg->len, msg->seq); | ||
| 143 | else | ||
| 144 | fill_pkg(NULL, tfr); | ||
| 145 | spin_unlock(&receiving_list_lock); | ||
| 146 | } | ||
| 147 | |||
| 148 | /** | ||
| 149 | * dm_consult_userspace | ||
| 150 | * @uuid: log's uuid (must be DM_UUID_LEN in size) | ||
| 151 | * @request_type: found in include/linux/dm-log-userspace.h | ||
| 152 | * @data: data to tx to the server | ||
| 153 | * @data_size: size of data in bytes | ||
| 154 | * @rdata: place to put return data from server | ||
| 155 | * @rdata_size: value-result (amount of space given/amount of space used) | ||
| 156 | * | ||
| 157 | * rdata_size is undefined on failure. | ||
| 158 | * | ||
| 159 | * Memory used to communicate with userspace is zero'ed | ||
| 160 | * before populating to ensure that no unwanted bits leak | ||
| 161 | * from kernel space to user-space. All userspace log communications | ||
| 162 | * between kernel and user space go through this function. | ||
| 163 | * | ||
| 164 | * Returns: 0 on success, -EXXX on failure | ||
| 165 | **/ | ||
| 166 | int dm_consult_userspace(const char *uuid, int request_type, | ||
| 167 | char *data, size_t data_size, | ||
| 168 | char *rdata, size_t *rdata_size) | ||
| 169 | { | ||
| 170 | int r = 0; | ||
| 171 | size_t dummy = 0; | ||
| 172 | int overhead_size = | ||
| 173 | sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg); | ||
| 174 | struct dm_ulog_request *tfr = prealloced_ulog_tfr; | ||
| 175 | struct receiving_pkg pkg; | ||
| 176 | |||
| 177 | if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { | ||
| 178 | DMINFO("Size of tfr exceeds preallocated size"); | ||
| 179 | return -EINVAL; | ||
| 180 | } | ||
| 181 | |||
| 182 | if (!rdata_size) | ||
| 183 | rdata_size = &dummy; | ||
| 184 | resend: | ||
| 185 | /* | ||
| 186 | * We serialize the sending of requests so we can | ||
| 187 | * use the preallocated space. | ||
| 188 | */ | ||
| 189 | mutex_lock(&dm_ulog_lock); | ||
| 190 | |||
| 191 | memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); | ||
| 192 | memcpy(tfr->uuid, uuid, DM_UUID_LEN); | ||
| 193 | tfr->seq = dm_ulog_seq++; | ||
| 194 | |||
| 195 | /* | ||
| 196 | * Must be valid request type (all other bits set to | ||
| 197 | * zero). This reserves other bits for possible future | ||
| 198 | * use. | ||
| 199 | */ | ||
| 200 | tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; | ||
| 201 | |||
| 202 | tfr->data_size = data_size; | ||
| 203 | if (data && data_size) | ||
| 204 | memcpy(tfr->data, data, data_size); | ||
| 205 | |||
| 206 | memset(&pkg, 0, sizeof(pkg)); | ||
| 207 | init_completion(&pkg.complete); | ||
| 208 | pkg.seq = tfr->seq; | ||
| 209 | pkg.data_size = rdata_size; | ||
| 210 | pkg.data = rdata; | ||
| 211 | spin_lock(&receiving_list_lock); | ||
| 212 | list_add(&(pkg.list), &receiving_list); | ||
| 213 | spin_unlock(&receiving_list_lock); | ||
| 214 | |||
| 215 | r = dm_ulog_sendto_server(tfr); | ||
| 216 | |||
| 217 | mutex_unlock(&dm_ulog_lock); | ||
| 218 | |||
| 219 | if (r) { | ||
| 220 | DMERR("Unable to send log request [%u] to userspace: %d", | ||
| 221 | request_type, r); | ||
| 222 | spin_lock(&receiving_list_lock); | ||
| 223 | list_del_init(&(pkg.list)); | ||
| 224 | spin_unlock(&receiving_list_lock); | ||
| 225 | |||
| 226 | goto out; | ||
| 227 | } | ||
| 228 | |||
| 229 | r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); | ||
| 230 | spin_lock(&receiving_list_lock); | ||
| 231 | list_del_init(&(pkg.list)); | ||
| 232 | spin_unlock(&receiving_list_lock); | ||
| 233 | if (!r) { | ||
| 234 | DMWARN("[%s] Request timed out: [%u/%u] - retrying", | ||
| 235 | (strlen(uuid) > 8) ? | ||
| 236 | (uuid + (strlen(uuid) - 8)) : (uuid), | ||
| 237 | request_type, pkg.seq); | ||
| 238 | goto resend; | ||
| 239 | } | ||
| 240 | |||
| 241 | r = pkg.error; | ||
| 242 | if (r == -EAGAIN) | ||
| 243 | goto resend; | ||
| 244 | |||
| 245 | out: | ||
| 246 | return r; | ||
| 247 | } | ||
| 248 | |||
| 249 | int dm_ulog_tfr_init(void) | ||
| 250 | { | ||
| 251 | int r; | ||
| 252 | void *prealloced; | ||
| 253 | |||
| 254 | INIT_LIST_HEAD(&receiving_list); | ||
| 255 | |||
| 256 | prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); | ||
| 257 | if (!prealloced) | ||
| 258 | return -ENOMEM; | ||
| 259 | |||
| 260 | prealloced_cn_msg = prealloced; | ||
| 261 | prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); | ||
| 262 | |||
| 263 | r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); | ||
| 264 | if (r) { | ||
| 265 | cn_del_callback(&ulog_cn_id); | ||
| 266 | return r; | ||
| 267 | } | ||
| 268 | |||
| 269 | return 0; | ||
| 270 | } | ||
| 271 | |||
| 272 | void dm_ulog_tfr_exit(void) | ||
| 273 | { | ||
| 274 | cn_del_callback(&ulog_cn_id); | ||
| 275 | kfree(prealloced_cn_msg); | ||
| 276 | } | ||
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h new file mode 100644 index 000000000000..c26d8e4e2710 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.h | |||
| @@ -0,0 +1,18 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2006-2009 Red Hat, Inc. | ||
| 3 | * | ||
| 4 | * This file is released under the LGPL. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #ifndef __DM_LOG_USERSPACE_TRANSFER_H__ | ||
| 8 | #define __DM_LOG_USERSPACE_TRANSFER_H__ | ||
| 9 | |||
| 10 | #define DM_MSG_PREFIX "dm-log-userspace" | ||
| 11 | |||
| 12 | int dm_ulog_tfr_init(void); | ||
| 13 | void dm_ulog_tfr_exit(void); | ||
| 14 | int dm_consult_userspace(const char *uuid, int request_type, | ||
| 15 | char *data, size_t data_size, | ||
| 16 | char *rdata, size_t *rdata_size); | ||
| 17 | |||
| 18 | #endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ | ||
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index be233bc4d917..9443896ede07 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
| @@ -412,10 +412,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
| 412 | /* | 412 | /* |
| 413 | * Buffer holds both header and bitset. | 413 | * Buffer holds both header and bitset. |
| 414 | */ | 414 | */ |
| 415 | buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + | 415 | buf_size = |
| 416 | bitset_size, ti->limits.hardsect_size); | 416 | dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, |
| 417 | bdev_logical_block_size(lc->header_location. | ||
| 418 | bdev)); | ||
| 417 | 419 | ||
| 418 | if (buf_size > dev->bdev->bd_inode->i_size) { | 420 | if (buf_size > i_size_read(dev->bdev->bd_inode)) { |
| 419 | DMWARN("log device %s too small: need %llu bytes", | 421 | DMWARN("log device %s too small: need %llu bytes", |
| 420 | dev->name, (unsigned long long)buf_size); | 422 | dev->name, (unsigned long long)buf_size); |
| 421 | kfree(lc); | 423 | kfree(lc); |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6a386ab4f7eb..c70604a20897 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
| @@ -8,7 +8,6 @@ | |||
| 8 | #include <linux/device-mapper.h> | 8 | #include <linux/device-mapper.h> |
| 9 | 9 | ||
| 10 | #include "dm-path-selector.h" | 10 | #include "dm-path-selector.h" |
| 11 | #include "dm-bio-record.h" | ||
| 12 | #include "dm-uevent.h" | 11 | #include "dm-uevent.h" |
| 13 | 12 | ||
| 14 | #include <linux/ctype.h> | 13 | #include <linux/ctype.h> |
| @@ -35,6 +34,7 @@ struct pgpath { | |||
| 35 | 34 | ||
| 36 | struct dm_path path; | 35 | struct dm_path path; |
| 37 | struct work_struct deactivate_path; | 36 | struct work_struct deactivate_path; |
| 37 | struct work_struct activate_path; | ||
| 38 | }; | 38 | }; |
| 39 | 39 | ||
| 40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) | 40 | #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) |
| @@ -64,8 +64,6 @@ struct multipath { | |||
| 64 | spinlock_t lock; | 64 | spinlock_t lock; |
| 65 | 65 | ||
| 66 | const char *hw_handler_name; | 66 | const char *hw_handler_name; |
| 67 | struct work_struct activate_path; | ||
| 68 | struct pgpath *pgpath_to_activate; | ||
| 69 | unsigned nr_priority_groups; | 67 | unsigned nr_priority_groups; |
| 70 | struct list_head priority_groups; | 68 | struct list_head priority_groups; |
| 71 | unsigned pg_init_required; /* pg_init needs calling? */ | 69 | unsigned pg_init_required; /* pg_init needs calling? */ |
| @@ -84,7 +82,7 @@ struct multipath { | |||
| 84 | unsigned pg_init_count; /* Number of times pg_init called */ | 82 | unsigned pg_init_count; /* Number of times pg_init called */ |
| 85 | 83 | ||
| 86 | struct work_struct process_queued_ios; | 84 | struct work_struct process_queued_ios; |
| 87 | struct bio_list queued_ios; | 85 | struct list_head queued_ios; |
| 88 | unsigned queue_size; | 86 | unsigned queue_size; |
| 89 | 87 | ||
| 90 | struct work_struct trigger_event; | 88 | struct work_struct trigger_event; |
| @@ -101,7 +99,7 @@ struct multipath { | |||
| 101 | */ | 99 | */ |
| 102 | struct dm_mpath_io { | 100 | struct dm_mpath_io { |
| 103 | struct pgpath *pgpath; | 101 | struct pgpath *pgpath; |
| 104 | struct dm_bio_details details; | 102 | size_t nr_bytes; |
| 105 | }; | 103 | }; |
| 106 | 104 | ||
| 107 | typedef int (*action_fn) (struct pgpath *pgpath); | 105 | typedef int (*action_fn) (struct pgpath *pgpath); |
| @@ -128,6 +126,7 @@ static struct pgpath *alloc_pgpath(void) | |||
| 128 | if (pgpath) { | 126 | if (pgpath) { |
| 129 | pgpath->is_active = 1; | 127 | pgpath->is_active = 1; |
| 130 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); | 128 | INIT_WORK(&pgpath->deactivate_path, deactivate_path); |
| 129 | INIT_WORK(&pgpath->activate_path, activate_path); | ||
| 131 | } | 130 | } |
| 132 | 131 | ||
| 133 | return pgpath; | 132 | return pgpath; |
| @@ -160,7 +159,6 @@ static struct priority_group *alloc_priority_group(void) | |||
| 160 | 159 | ||
| 161 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | 160 | static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) |
| 162 | { | 161 | { |
| 163 | unsigned long flags; | ||
| 164 | struct pgpath *pgpath, *tmp; | 162 | struct pgpath *pgpath, *tmp; |
| 165 | struct multipath *m = ti->private; | 163 | struct multipath *m = ti->private; |
| 166 | 164 | ||
| @@ -169,10 +167,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti) | |||
| 169 | if (m->hw_handler_name) | 167 | if (m->hw_handler_name) |
| 170 | scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); | 168 | scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev)); |
| 171 | dm_put_device(ti, pgpath->path.dev); | 169 | dm_put_device(ti, pgpath->path.dev); |
| 172 | spin_lock_irqsave(&m->lock, flags); | ||
| 173 | if (m->pgpath_to_activate == pgpath) | ||
| 174 | m->pgpath_to_activate = NULL; | ||
| 175 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 176 | free_pgpath(pgpath); | 170 | free_pgpath(pgpath); |
| 177 | } | 171 | } |
| 178 | } | 172 | } |
| @@ -198,11 +192,11 @@ static struct multipath *alloc_multipath(struct dm_target *ti) | |||
| 198 | m = kzalloc(sizeof(*m), GFP_KERNEL); | 192 | m = kzalloc(sizeof(*m), GFP_KERNEL); |
| 199 | if (m) { | 193 | if (m) { |
| 200 | INIT_LIST_HEAD(&m->priority_groups); | 194 | INIT_LIST_HEAD(&m->priority_groups); |
| 195 | INIT_LIST_HEAD(&m->queued_ios); | ||
| 201 | spin_lock_init(&m->lock); | 196 | spin_lock_init(&m->lock); |
| 202 | m->queue_io = 1; | 197 | m->queue_io = 1; |
| 203 | INIT_WORK(&m->process_queued_ios, process_queued_ios); | 198 | INIT_WORK(&m->process_queued_ios, process_queued_ios); |
| 204 | INIT_WORK(&m->trigger_event, trigger_event); | 199 | INIT_WORK(&m->trigger_event, trigger_event); |
| 205 | INIT_WORK(&m->activate_path, activate_path); | ||
| 206 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); | 200 | m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); |
| 207 | if (!m->mpio_pool) { | 201 | if (!m->mpio_pool) { |
| 208 | kfree(m); | 202 | kfree(m); |
| @@ -250,11 +244,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) | |||
| 250 | m->pg_init_count = 0; | 244 | m->pg_init_count = 0; |
| 251 | } | 245 | } |
| 252 | 246 | ||
| 253 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | 247 | static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, |
| 248 | size_t nr_bytes) | ||
| 254 | { | 249 | { |
| 255 | struct dm_path *path; | 250 | struct dm_path *path; |
| 256 | 251 | ||
| 257 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); | 252 | path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); |
| 258 | if (!path) | 253 | if (!path) |
| 259 | return -ENXIO; | 254 | return -ENXIO; |
| 260 | 255 | ||
| @@ -266,7 +261,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) | |||
| 266 | return 0; | 261 | return 0; |
| 267 | } | 262 | } |
| 268 | 263 | ||
| 269 | static void __choose_pgpath(struct multipath *m) | 264 | static void __choose_pgpath(struct multipath *m, size_t nr_bytes) |
| 270 | { | 265 | { |
| 271 | struct priority_group *pg; | 266 | struct priority_group *pg; |
| 272 | unsigned bypassed = 1; | 267 | unsigned bypassed = 1; |
| @@ -278,12 +273,12 @@ static void __choose_pgpath(struct multipath *m) | |||
| 278 | if (m->next_pg) { | 273 | if (m->next_pg) { |
| 279 | pg = m->next_pg; | 274 | pg = m->next_pg; |
| 280 | m->next_pg = NULL; | 275 | m->next_pg = NULL; |
| 281 | if (!__choose_path_in_pg(m, pg)) | 276 | if (!__choose_path_in_pg(m, pg, nr_bytes)) |
| 282 | return; | 277 | return; |
| 283 | } | 278 | } |
| 284 | 279 | ||
| 285 | /* Don't change PG until it has no remaining paths */ | 280 | /* Don't change PG until it has no remaining paths */ |
| 286 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) | 281 | if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) |
| 287 | return; | 282 | return; |
| 288 | 283 | ||
| 289 | /* | 284 | /* |
| @@ -295,7 +290,7 @@ static void __choose_pgpath(struct multipath *m) | |||
| 295 | list_for_each_entry(pg, &m->priority_groups, list) { | 290 | list_for_each_entry(pg, &m->priority_groups, list) { |
| 296 | if (pg->bypassed == bypassed) | 291 | if (pg->bypassed == bypassed) |
| 297 | continue; | 292 | continue; |
| 298 | if (!__choose_path_in_pg(m, pg)) | 293 | if (!__choose_path_in_pg(m, pg, nr_bytes)) |
| 299 | return; | 294 | return; |
| 300 | } | 295 | } |
| 301 | } while (bypassed--); | 296 | } while (bypassed--); |
| @@ -322,19 +317,21 @@ static int __must_push_back(struct multipath *m) | |||
| 322 | dm_noflush_suspending(m->ti)); | 317 | dm_noflush_suspending(m->ti)); |
| 323 | } | 318 | } |
| 324 | 319 | ||
| 325 | static int map_io(struct multipath *m, struct bio *bio, | 320 | static int map_io(struct multipath *m, struct request *clone, |
| 326 | struct dm_mpath_io *mpio, unsigned was_queued) | 321 | struct dm_mpath_io *mpio, unsigned was_queued) |
| 327 | { | 322 | { |
| 328 | int r = DM_MAPIO_REMAPPED; | 323 | int r = DM_MAPIO_REMAPPED; |
| 324 | size_t nr_bytes = blk_rq_bytes(clone); | ||
| 329 | unsigned long flags; | 325 | unsigned long flags; |
| 330 | struct pgpath *pgpath; | 326 | struct pgpath *pgpath; |
| 327 | struct block_device *bdev; | ||
| 331 | 328 | ||
| 332 | spin_lock_irqsave(&m->lock, flags); | 329 | spin_lock_irqsave(&m->lock, flags); |
| 333 | 330 | ||
| 334 | /* Do we need to select a new pgpath? */ | 331 | /* Do we need to select a new pgpath? */ |
| 335 | if (!m->current_pgpath || | 332 | if (!m->current_pgpath || |
| 336 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) | 333 | (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) |
| 337 | __choose_pgpath(m); | 334 | __choose_pgpath(m, nr_bytes); |
| 338 | 335 | ||
| 339 | pgpath = m->current_pgpath; | 336 | pgpath = m->current_pgpath; |
| 340 | 337 | ||
| @@ -344,21 +341,28 @@ static int map_io(struct multipath *m, struct bio *bio, | |||
| 344 | if ((pgpath && m->queue_io) || | 341 | if ((pgpath && m->queue_io) || |
| 345 | (!pgpath && m->queue_if_no_path)) { | 342 | (!pgpath && m->queue_if_no_path)) { |
| 346 | /* Queue for the daemon to resubmit */ | 343 | /* Queue for the daemon to resubmit */ |
| 347 | bio_list_add(&m->queued_ios, bio); | 344 | list_add_tail(&clone->queuelist, &m->queued_ios); |
| 348 | m->queue_size++; | 345 | m->queue_size++; |
| 349 | if ((m->pg_init_required && !m->pg_init_in_progress) || | 346 | if ((m->pg_init_required && !m->pg_init_in_progress) || |
| 350 | !m->queue_io) | 347 | !m->queue_io) |
| 351 | queue_work(kmultipathd, &m->process_queued_ios); | 348 | queue_work(kmultipathd, &m->process_queued_ios); |
| 352 | pgpath = NULL; | 349 | pgpath = NULL; |
| 353 | r = DM_MAPIO_SUBMITTED; | 350 | r = DM_MAPIO_SUBMITTED; |
| 354 | } else if (pgpath) | 351 | } else if (pgpath) { |
| 355 | bio->bi_bdev = pgpath->path.dev->bdev; | 352 | bdev = pgpath->path.dev->bdev; |
| 356 | else if (__must_push_back(m)) | 353 | clone->q = bdev_get_queue(bdev); |
| 354 | clone->rq_disk = bdev->bd_disk; | ||
| 355 | } else if (__must_push_back(m)) | ||
| 357 | r = DM_MAPIO_REQUEUE; | 356 | r = DM_MAPIO_REQUEUE; |
| 358 | else | 357 | else |
| 359 | r = -EIO; /* Failed */ | 358 | r = -EIO; /* Failed */ |
| 360 | 359 | ||
| 361 | mpio->pgpath = pgpath; | 360 | mpio->pgpath = pgpath; |
| 361 | mpio->nr_bytes = nr_bytes; | ||
| 362 | |||
| 363 | if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io) | ||
| 364 | pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, | ||
| 365 | nr_bytes); | ||
| 362 | 366 | ||
| 363 | spin_unlock_irqrestore(&m->lock, flags); | 367 | spin_unlock_irqrestore(&m->lock, flags); |
| 364 | 368 | ||
| @@ -396,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m) | |||
| 396 | { | 400 | { |
| 397 | int r; | 401 | int r; |
| 398 | unsigned long flags; | 402 | unsigned long flags; |
| 399 | struct bio *bio = NULL, *next; | ||
| 400 | struct dm_mpath_io *mpio; | 403 | struct dm_mpath_io *mpio; |
| 401 | union map_info *info; | 404 | union map_info *info; |
| 405 | struct request *clone, *n; | ||
| 406 | LIST_HEAD(cl); | ||
| 402 | 407 | ||
| 403 | spin_lock_irqsave(&m->lock, flags); | 408 | spin_lock_irqsave(&m->lock, flags); |
| 404 | bio = bio_list_get(&m->queued_ios); | 409 | list_splice_init(&m->queued_ios, &cl); |
| 405 | spin_unlock_irqrestore(&m->lock, flags); | 410 | spin_unlock_irqrestore(&m->lock, flags); |
| 406 | 411 | ||
| 407 | while (bio) { | 412 | list_for_each_entry_safe(clone, n, &cl, queuelist) { |
| 408 | next = bio->bi_next; | 413 | list_del_init(&clone->queuelist); |
| 409 | bio->bi_next = NULL; | ||
| 410 | 414 | ||
| 411 | info = dm_get_mapinfo(bio); | 415 | info = dm_get_rq_mapinfo(clone); |
| 412 | mpio = info->ptr; | 416 | mpio = info->ptr; |
| 413 | 417 | ||
| 414 | r = map_io(m, bio, mpio, 1); | 418 | r = map_io(m, clone, mpio, 1); |
| 415 | if (r < 0) | 419 | if (r < 0) { |
| 416 | bio_endio(bio, r); | 420 | mempool_free(mpio, m->mpio_pool); |
| 417 | else if (r == DM_MAPIO_REMAPPED) | 421 | dm_kill_unmapped_request(clone, r); |
| 418 | generic_make_request(bio); | 422 | } else if (r == DM_MAPIO_REMAPPED) |
| 419 | else if (r == DM_MAPIO_REQUEUE) | 423 | dm_dispatch_request(clone); |
| 420 | bio_endio(bio, -EIO); | 424 | else if (r == DM_MAPIO_REQUEUE) { |
| 421 | 425 | mempool_free(mpio, m->mpio_pool); | |
| 422 | bio = next; | 426 | dm_requeue_unmapped_request(clone); |
| 427 | } | ||
| 423 | } | 428 | } |
| 424 | } | 429 | } |
| 425 | 430 | ||
| @@ -427,8 +432,8 @@ static void process_queued_ios(struct work_struct *work) | |||
| 427 | { | 432 | { |
| 428 | struct multipath *m = | 433 | struct multipath *m = |
| 429 | container_of(work, struct multipath, process_queued_ios); | 434 | container_of(work, struct multipath, process_queued_ios); |
| 430 | struct pgpath *pgpath = NULL; | 435 | struct pgpath *pgpath = NULL, *tmp; |
| 431 | unsigned init_required = 0, must_queue = 1; | 436 | unsigned must_queue = 1; |
| 432 | unsigned long flags; | 437 | unsigned long flags; |
| 433 | 438 | ||
| 434 | spin_lock_irqsave(&m->lock, flags); | 439 | spin_lock_irqsave(&m->lock, flags); |
| @@ -437,7 +442,7 @@ static void process_queued_ios(struct work_struct *work) | |||
| 437 | goto out; | 442 | goto out; |
| 438 | 443 | ||
| 439 | if (!m->current_pgpath) | 444 | if (!m->current_pgpath) |
| 440 | __choose_pgpath(m); | 445 | __choose_pgpath(m, 0); |
| 441 | 446 | ||
| 442 | pgpath = m->current_pgpath; | 447 | pgpath = m->current_pgpath; |
| 443 | 448 | ||
| @@ -446,19 +451,15 @@ static void process_queued_ios(struct work_struct *work) | |||
| 446 | must_queue = 0; | 451 | must_queue = 0; |
| 447 | 452 | ||
| 448 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { | 453 | if (m->pg_init_required && !m->pg_init_in_progress && pgpath) { |
| 449 | m->pgpath_to_activate = pgpath; | ||
| 450 | m->pg_init_count++; | 454 | m->pg_init_count++; |
| 451 | m->pg_init_required = 0; | 455 | m->pg_init_required = 0; |
| 452 | m->pg_init_in_progress = 1; | 456 | list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) { |
| 453 | init_required = 1; | 457 | if (queue_work(kmpath_handlerd, &tmp->activate_path)) |
| 458 | m->pg_init_in_progress++; | ||
| 459 | } | ||
| 454 | } | 460 | } |
| 455 | |||
| 456 | out: | 461 | out: |
| 457 | spin_unlock_irqrestore(&m->lock, flags); | 462 | spin_unlock_irqrestore(&m->lock, flags); |
| 458 | |||
| 459 | if (init_required) | ||
| 460 | queue_work(kmpath_handlerd, &m->activate_path); | ||
| 461 | |||
| 462 | if (!must_queue) | 463 | if (!must_queue) |
| 463 | dispatch_queued_ios(m); | 464 | dispatch_queued_ios(m); |
| 464 | } | 465 | } |
| @@ -553,6 +554,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, | |||
| 553 | return -EINVAL; | 554 | return -EINVAL; |
| 554 | } | 555 | } |
| 555 | 556 | ||
| 557 | if (ps_argc > as->argc) { | ||
| 558 | dm_put_path_selector(pst); | ||
| 559 | ti->error = "not enough arguments for path selector"; | ||
| 560 | return -EINVAL; | ||
| 561 | } | ||
| 562 | |||
| 556 | r = pst->create(&pg->ps, ps_argc, as->argv); | 563 | r = pst->create(&pg->ps, ps_argc, as->argv); |
| 557 | if (r) { | 564 | if (r) { |
| 558 | dm_put_path_selector(pst); | 565 | dm_put_path_selector(pst); |
| @@ -591,9 +598,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, | |||
| 591 | } | 598 | } |
| 592 | 599 | ||
| 593 | if (m->hw_handler_name) { | 600 | if (m->hw_handler_name) { |
| 594 | r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), | 601 | struct request_queue *q = bdev_get_queue(p->path.dev->bdev); |
| 595 | m->hw_handler_name); | 602 | |
| 603 | r = scsi_dh_attach(q, m->hw_handler_name); | ||
| 604 | if (r == -EBUSY) { | ||
| 605 | /* | ||
| 606 | * Already attached to different hw_handler, | ||
| 607 | * try to reattach with correct one. | ||
| 608 | */ | ||
| 609 | scsi_dh_detach(q); | ||
| 610 | r = scsi_dh_attach(q, m->hw_handler_name); | ||
| 611 | } | ||
| 612 | |||
| 596 | if (r < 0) { | 613 | if (r < 0) { |
| 614 | ti->error = "error attaching hardware handler"; | ||
| 597 | dm_put_device(ti, p->path.dev); | 615 | dm_put_device(ti, p->path.dev); |
| 598 | goto bad; | 616 | goto bad; |
| 599 | } | 617 | } |
| @@ -699,6 +717,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m) | |||
| 699 | if (!hw_argc) | 717 | if (!hw_argc) |
| 700 | return 0; | 718 | return 0; |
| 701 | 719 | ||
| 720 | if (hw_argc > as->argc) { | ||
| 721 | ti->error = "not enough arguments for hardware handler"; | ||
| 722 | return -EINVAL; | ||
| 723 | } | ||
| 724 | |||
| 702 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); | 725 | m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); |
| 703 | request_module("scsi_dh_%s", m->hw_handler_name); | 726 | request_module("scsi_dh_%s", m->hw_handler_name); |
| 704 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { | 727 | if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { |
| @@ -823,6 +846,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, | |||
| 823 | goto bad; | 846 | goto bad; |
| 824 | } | 847 | } |
| 825 | 848 | ||
| 849 | ti->num_flush_requests = 1; | ||
| 850 | |||
| 826 | return 0; | 851 | return 0; |
| 827 | 852 | ||
| 828 | bad: | 853 | bad: |
| @@ -836,25 +861,29 @@ static void multipath_dtr(struct dm_target *ti) | |||
| 836 | 861 | ||
| 837 | flush_workqueue(kmpath_handlerd); | 862 | flush_workqueue(kmpath_handlerd); |
| 838 | flush_workqueue(kmultipathd); | 863 | flush_workqueue(kmultipathd); |
| 864 | flush_scheduled_work(); | ||
| 839 | free_multipath(m); | 865 | free_multipath(m); |
| 840 | } | 866 | } |
| 841 | 867 | ||
| 842 | /* | 868 | /* |
| 843 | * Map bios, recording original fields for later in case we have to resubmit | 869 | * Map cloned requests |
| 844 | */ | 870 | */ |
| 845 | static int multipath_map(struct dm_target *ti, struct bio *bio, | 871 | static int multipath_map(struct dm_target *ti, struct request *clone, |
| 846 | union map_info *map_context) | 872 | union map_info *map_context) |
| 847 | { | 873 | { |
| 848 | int r; | 874 | int r; |
| 849 | struct dm_mpath_io *mpio; | 875 | struct dm_mpath_io *mpio; |
| 850 | struct multipath *m = (struct multipath *) ti->private; | 876 | struct multipath *m = (struct multipath *) ti->private; |
| 851 | 877 | ||
| 852 | mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); | 878 | mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); |
| 853 | dm_bio_record(&mpio->details, bio); | 879 | if (!mpio) |
| 880 | /* ENOMEM, requeue */ | ||
| 881 | return DM_MAPIO_REQUEUE; | ||
| 882 | memset(mpio, 0, sizeof(*mpio)); | ||
| 854 | 883 | ||
| 855 | map_context->ptr = mpio; | 884 | map_context->ptr = mpio; |
| 856 | bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT); | 885 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; |
| 857 | r = map_io(m, bio, mpio, 0); | 886 | r = map_io(m, clone, mpio, 0); |
| 858 | if (r < 0 || r == DM_MAPIO_REQUEUE) | 887 | if (r < 0 || r == DM_MAPIO_REQUEUE) |
| 859 | mempool_free(mpio, m->mpio_pool); | 888 | mempool_free(mpio, m->mpio_pool); |
| 860 | 889 | ||
| @@ -924,9 +953,13 @@ static int reinstate_path(struct pgpath *pgpath) | |||
| 924 | 953 | ||
| 925 | pgpath->is_active = 1; | 954 | pgpath->is_active = 1; |
| 926 | 955 | ||
| 927 | m->current_pgpath = NULL; | 956 | if (!m->nr_valid_paths++ && m->queue_size) { |
| 928 | if (!m->nr_valid_paths++ && m->queue_size) | 957 | m->current_pgpath = NULL; |
| 929 | queue_work(kmultipathd, &m->process_queued_ios); | 958 | queue_work(kmultipathd, &m->process_queued_ios); |
| 959 | } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { | ||
| 960 | if (queue_work(kmpath_handlerd, &pgpath->activate_path)) | ||
| 961 | m->pg_init_in_progress++; | ||
| 962 | } | ||
| 930 | 963 | ||
| 931 | dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, | 964 | dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, |
| 932 | pgpath->path.dev->name, m->nr_valid_paths); | 965 | pgpath->path.dev->name, m->nr_valid_paths); |
| @@ -1102,87 +1135,70 @@ static void pg_init_done(struct dm_path *path, int errors) | |||
| 1102 | 1135 | ||
| 1103 | spin_lock_irqsave(&m->lock, flags); | 1136 | spin_lock_irqsave(&m->lock, flags); |
| 1104 | if (errors) { | 1137 | if (errors) { |
| 1105 | DMERR("Could not failover device. Error %d.", errors); | 1138 | if (pgpath == m->current_pgpath) { |
| 1106 | m->current_pgpath = NULL; | 1139 | DMERR("Could not failover device. Error %d.", errors); |
| 1107 | m->current_pg = NULL; | 1140 | m->current_pgpath = NULL; |
| 1141 | m->current_pg = NULL; | ||
| 1142 | } | ||
| 1108 | } else if (!m->pg_init_required) { | 1143 | } else if (!m->pg_init_required) { |
| 1109 | m->queue_io = 0; | 1144 | m->queue_io = 0; |
| 1110 | pg->bypassed = 0; | 1145 | pg->bypassed = 0; |
| 1111 | } | 1146 | } |
| 1112 | 1147 | ||
| 1113 | m->pg_init_in_progress = 0; | 1148 | m->pg_init_in_progress--; |
| 1114 | queue_work(kmultipathd, &m->process_queued_ios); | 1149 | if (!m->pg_init_in_progress) |
| 1150 | queue_work(kmultipathd, &m->process_queued_ios); | ||
| 1115 | spin_unlock_irqrestore(&m->lock, flags); | 1151 | spin_unlock_irqrestore(&m->lock, flags); |
| 1116 | } | 1152 | } |
| 1117 | 1153 | ||
| 1118 | static void activate_path(struct work_struct *work) | 1154 | static void activate_path(struct work_struct *work) |
| 1119 | { | 1155 | { |
| 1120 | int ret; | 1156 | int ret; |
| 1121 | struct multipath *m = | 1157 | struct pgpath *pgpath = |
| 1122 | container_of(work, struct multipath, activate_path); | 1158 | container_of(work, struct pgpath, activate_path); |
| 1123 | struct dm_path *path; | ||
| 1124 | unsigned long flags; | ||
| 1125 | 1159 | ||
| 1126 | spin_lock_irqsave(&m->lock, flags); | 1160 | ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev)); |
| 1127 | path = &m->pgpath_to_activate->path; | 1161 | pg_init_done(&pgpath->path, ret); |
| 1128 | m->pgpath_to_activate = NULL; | ||
| 1129 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1130 | if (!path) | ||
| 1131 | return; | ||
| 1132 | ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); | ||
| 1133 | pg_init_done(path, ret); | ||
| 1134 | } | 1162 | } |
| 1135 | 1163 | ||
| 1136 | /* | 1164 | /* |
| 1137 | * end_io handling | 1165 | * end_io handling |
| 1138 | */ | 1166 | */ |
| 1139 | static int do_end_io(struct multipath *m, struct bio *bio, | 1167 | static int do_end_io(struct multipath *m, struct request *clone, |
| 1140 | int error, struct dm_mpath_io *mpio) | 1168 | int error, struct dm_mpath_io *mpio) |
| 1141 | { | 1169 | { |
| 1170 | /* | ||
| 1171 | * We don't queue any clone request inside the multipath target | ||
| 1172 | * during end I/O handling, since those clone requests don't have | ||
| 1173 | * bio clones. If we queue them inside the multipath target, | ||
| 1174 | * we need to make bio clones, that requires memory allocation. | ||
| 1175 | * (See drivers/md/dm.c:end_clone_bio() about why the clone requests | ||
| 1176 | * don't have bio clones.) | ||
| 1177 | * Instead of queueing the clone request here, we queue the original | ||
| 1178 | * request into dm core, which will remake a clone request and | ||
| 1179 | * clone bios for it and resubmit it later. | ||
| 1180 | */ | ||
| 1181 | int r = DM_ENDIO_REQUEUE; | ||
| 1142 | unsigned long flags; | 1182 | unsigned long flags; |
| 1143 | 1183 | ||
| 1144 | if (!error) | 1184 | if (!error && !clone->errors) |
| 1145 | return 0; /* I/O complete */ | 1185 | return 0; /* I/O complete */ |
| 1146 | 1186 | ||
| 1147 | if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) | ||
| 1148 | return error; | ||
| 1149 | |||
| 1150 | if (error == -EOPNOTSUPP) | 1187 | if (error == -EOPNOTSUPP) |
| 1151 | return error; | 1188 | return error; |
| 1152 | 1189 | ||
| 1153 | spin_lock_irqsave(&m->lock, flags); | ||
| 1154 | if (!m->nr_valid_paths) { | ||
| 1155 | if (__must_push_back(m)) { | ||
| 1156 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1157 | return DM_ENDIO_REQUEUE; | ||
| 1158 | } else if (!m->queue_if_no_path) { | ||
| 1159 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1160 | return -EIO; | ||
| 1161 | } else { | ||
| 1162 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1163 | goto requeue; | ||
| 1164 | } | ||
| 1165 | } | ||
| 1166 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1167 | |||
| 1168 | if (mpio->pgpath) | 1190 | if (mpio->pgpath) |
| 1169 | fail_path(mpio->pgpath); | 1191 | fail_path(mpio->pgpath); |
| 1170 | 1192 | ||
| 1171 | requeue: | ||
| 1172 | dm_bio_restore(&mpio->details, bio); | ||
| 1173 | |||
| 1174 | /* queue for the daemon to resubmit or fail */ | ||
| 1175 | spin_lock_irqsave(&m->lock, flags); | 1193 | spin_lock_irqsave(&m->lock, flags); |
| 1176 | bio_list_add(&m->queued_ios, bio); | 1194 | if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m)) |
| 1177 | m->queue_size++; | 1195 | r = -EIO; |
| 1178 | if (!m->queue_io) | ||
| 1179 | queue_work(kmultipathd, &m->process_queued_ios); | ||
| 1180 | spin_unlock_irqrestore(&m->lock, flags); | 1196 | spin_unlock_irqrestore(&m->lock, flags); |
| 1181 | 1197 | ||
| 1182 | return DM_ENDIO_INCOMPLETE; /* io not complete */ | 1198 | return r; |
| 1183 | } | 1199 | } |
| 1184 | 1200 | ||
| 1185 | static int multipath_end_io(struct dm_target *ti, struct bio *bio, | 1201 | static int multipath_end_io(struct dm_target *ti, struct request *clone, |
| 1186 | int error, union map_info *map_context) | 1202 | int error, union map_info *map_context) |
| 1187 | { | 1203 | { |
| 1188 | struct multipath *m = ti->private; | 1204 | struct multipath *m = ti->private; |
| @@ -1191,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio, | |||
| 1191 | struct path_selector *ps; | 1207 | struct path_selector *ps; |
| 1192 | int r; | 1208 | int r; |
| 1193 | 1209 | ||
| 1194 | r = do_end_io(m, bio, error, mpio); | 1210 | r = do_end_io(m, clone, error, mpio); |
| 1195 | if (pgpath) { | 1211 | if (pgpath) { |
| 1196 | ps = &pgpath->pg->ps; | 1212 | ps = &pgpath->pg->ps; |
| 1197 | if (ps->type->end_io) | 1213 | if (ps->type->end_io) |
| 1198 | ps->type->end_io(ps, &pgpath->path); | 1214 | ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); |
| 1199 | } | 1215 | } |
| 1200 | if (r != DM_ENDIO_INCOMPLETE) | 1216 | mempool_free(mpio, m->mpio_pool); |
| 1201 | mempool_free(mpio, m->mpio_pool); | ||
| 1202 | 1217 | ||
| 1203 | return r; | 1218 | return r; |
| 1204 | } | 1219 | } |
| @@ -1411,7 +1426,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
| 1411 | spin_lock_irqsave(&m->lock, flags); | 1426 | spin_lock_irqsave(&m->lock, flags); |
| 1412 | 1427 | ||
| 1413 | if (!m->current_pgpath) | 1428 | if (!m->current_pgpath) |
| 1414 | __choose_pgpath(m); | 1429 | __choose_pgpath(m, 0); |
| 1415 | 1430 | ||
| 1416 | if (m->current_pgpath) { | 1431 | if (m->current_pgpath) { |
| 1417 | bdev = m->current_pgpath->path.dev->bdev; | 1432 | bdev = m->current_pgpath->path.dev->bdev; |
| @@ -1428,22 +1443,113 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, | |||
| 1428 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); | 1443 | return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); |
| 1429 | } | 1444 | } |
| 1430 | 1445 | ||
| 1446 | static int multipath_iterate_devices(struct dm_target *ti, | ||
| 1447 | iterate_devices_callout_fn fn, void *data) | ||
| 1448 | { | ||
| 1449 | struct multipath *m = ti->private; | ||
| 1450 | struct priority_group *pg; | ||
| 1451 | struct pgpath *p; | ||
| 1452 | int ret = 0; | ||
| 1453 | |||
| 1454 | list_for_each_entry(pg, &m->priority_groups, list) { | ||
| 1455 | list_for_each_entry(p, &pg->pgpaths, list) { | ||
| 1456 | ret = fn(ti, p->path.dev, ti->begin, data); | ||
| 1457 | if (ret) | ||
| 1458 | goto out; | ||
| 1459 | } | ||
| 1460 | } | ||
| 1461 | |||
| 1462 | out: | ||
| 1463 | return ret; | ||
| 1464 | } | ||
| 1465 | |||
| 1466 | static int __pgpath_busy(struct pgpath *pgpath) | ||
| 1467 | { | ||
| 1468 | struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); | ||
| 1469 | |||
| 1470 | return dm_underlying_device_busy(q); | ||
| 1471 | } | ||
| 1472 | |||
| 1473 | /* | ||
| 1474 | * We return "busy", only when we can map I/Os but underlying devices | ||
| 1475 | * are busy (so even if we map I/Os now, the I/Os will wait on | ||
| 1476 | * the underlying queue). | ||
| 1477 | * In other words, if we want to kill I/Os or queue them inside us | ||
| 1478 | * due to map unavailability, we don't return "busy". Otherwise, | ||
| 1479 | * dm core won't give us the I/Os and we can't do what we want. | ||
| 1480 | */ | ||
| 1481 | static int multipath_busy(struct dm_target *ti) | ||
| 1482 | { | ||
| 1483 | int busy = 0, has_active = 0; | ||
| 1484 | struct multipath *m = ti->private; | ||
| 1485 | struct priority_group *pg; | ||
| 1486 | struct pgpath *pgpath; | ||
| 1487 | unsigned long flags; | ||
| 1488 | |||
| 1489 | spin_lock_irqsave(&m->lock, flags); | ||
| 1490 | |||
| 1491 | /* Guess which priority_group will be used at next mapping time */ | ||
| 1492 | if (unlikely(!m->current_pgpath && m->next_pg)) | ||
| 1493 | pg = m->next_pg; | ||
| 1494 | else if (likely(m->current_pg)) | ||
| 1495 | pg = m->current_pg; | ||
| 1496 | else | ||
| 1497 | /* | ||
| 1498 | * We don't know which pg will be used at next mapping time. | ||
| 1499 | * We don't call __choose_pgpath() here to avoid to trigger | ||
| 1500 | * pg_init just by busy checking. | ||
| 1501 | * So we don't know whether underlying devices we will be using | ||
| 1502 | * at next mapping time are busy or not. Just try mapping. | ||
| 1503 | */ | ||
| 1504 | goto out; | ||
| 1505 | |||
| 1506 | /* | ||
| 1507 | * If there is one non-busy active path at least, the path selector | ||
| 1508 | * will be able to select it. So we consider such a pg as not busy. | ||
| 1509 | */ | ||
| 1510 | busy = 1; | ||
| 1511 | list_for_each_entry(pgpath, &pg->pgpaths, list) | ||
| 1512 | if (pgpath->is_active) { | ||
| 1513 | has_active = 1; | ||
| 1514 | |||
| 1515 | if (!__pgpath_busy(pgpath)) { | ||
| 1516 | busy = 0; | ||
| 1517 | break; | ||
| 1518 | } | ||
| 1519 | } | ||
| 1520 | |||
| 1521 | if (!has_active) | ||
| 1522 | /* | ||
| 1523 | * No active path in this pg, so this pg won't be used and | ||
| 1524 | * the current_pg will be changed at next mapping time. | ||
| 1525 | * We need to try mapping to determine it. | ||
| 1526 | */ | ||
| 1527 | busy = 0; | ||
| 1528 | |||
| 1529 | out: | ||
| 1530 | spin_unlock_irqrestore(&m->lock, flags); | ||
| 1531 | |||
| 1532 | return busy; | ||
| 1533 | } | ||
| 1534 | |||
| 1431 | /*----------------------------------------------------------------- | 1535 | /*----------------------------------------------------------------- |
| 1432 | * Module setup | 1536 | * Module setup |
| 1433 | *---------------------------------------------------------------*/ | 1537 | *---------------------------------------------------------------*/ |
| 1434 | static struct target_type multipath_target = { | 1538 | static struct target_type multipath_target = { |
| 1435 | .name = "multipath", | 1539 | .name = "multipath", |
| 1436 | .version = {1, 0, 5}, | 1540 | .version = {1, 1, 0}, |
| 1437 | .module = THIS_MODULE, | 1541 | .module = THIS_MODULE, |
| 1438 | .ctr = multipath_ctr, | 1542 | .ctr = multipath_ctr, |
| 1439 | .dtr = multipath_dtr, | 1543 | .dtr = multipath_dtr, |
| 1440 | .map = multipath_map, | 1544 | .map_rq = multipath_map, |
| 1441 | .end_io = multipath_end_io, | 1545 | .rq_end_io = multipath_end_io, |
| 1442 | .presuspend = multipath_presuspend, | 1546 | .presuspend = multipath_presuspend, |
| 1443 | .resume = multipath_resume, | 1547 | .resume = multipath_resume, |
| 1444 | .status = multipath_status, | 1548 | .status = multipath_status, |
| 1445 | .message = multipath_message, | 1549 | .message = multipath_message, |
| 1446 | .ioctl = multipath_ioctl, | 1550 | .ioctl = multipath_ioctl, |
| 1551 | .iterate_devices = multipath_iterate_devices, | ||
| 1552 | .busy = multipath_busy, | ||
| 1447 | }; | 1553 | }; |
| 1448 | 1554 | ||
| 1449 | static int __init dm_multipath_init(void) | 1555 | static int __init dm_multipath_init(void) |
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 27357b85d73d..e7d1fa8b0459 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h | |||
| @@ -56,7 +56,8 @@ struct path_selector_type { | |||
| 56 | * the path fails. | 56 | * the path fails. |
| 57 | */ | 57 | */ |
| 58 | struct dm_path *(*select_path) (struct path_selector *ps, | 58 | struct dm_path *(*select_path) (struct path_selector *ps, |
| 59 | unsigned *repeat_count); | 59 | unsigned *repeat_count, |
| 60 | size_t nr_bytes); | ||
| 60 | 61 | ||
| 61 | /* | 62 | /* |
| 62 | * Notify the selector that a path has failed. | 63 | * Notify the selector that a path has failed. |
| @@ -75,7 +76,10 @@ struct path_selector_type { | |||
| 75 | int (*status) (struct path_selector *ps, struct dm_path *path, | 76 | int (*status) (struct path_selector *ps, struct dm_path *path, |
| 76 | status_type_t type, char *result, unsigned int maxlen); | 77 | status_type_t type, char *result, unsigned int maxlen); |
| 77 | 78 | ||
| 78 | int (*end_io) (struct path_selector *ps, struct dm_path *path); | 79 | int (*start_io) (struct path_selector *ps, struct dm_path *path, |
| 80 | size_t nr_bytes); | ||
| 81 | int (*end_io) (struct path_selector *ps, struct dm_path *path, | ||
| 82 | size_t nr_bytes); | ||
| 79 | }; | 83 | }; |
| 80 | 84 | ||
| 81 | /* Register a path selector */ | 85 | /* Register a path selector */ |
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c new file mode 100644 index 000000000000..f92b6cea9d9c --- /dev/null +++ b/drivers/md/dm-queue-length.c | |||
| @@ -0,0 +1,263 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. | ||
| 3 | * Copyright (C) 2006-2009 NEC Corporation. | ||
| 4 | * | ||
| 5 | * dm-queue-length.c | ||
| 6 | * | ||
| 7 | * Module Author: Stefan Bader, IBM | ||
| 8 | * Modified by: Kiyoshi Ueda, NEC | ||
| 9 | * | ||
| 10 | * This file is released under the GPL. | ||
| 11 | * | ||
| 12 | * queue-length path selector - choose a path with the least number of | ||
| 13 | * in-flight I/Os. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include "dm.h" | ||
| 17 | #include "dm-path-selector.h" | ||
| 18 | |||
| 19 | #include <linux/slab.h> | ||
| 20 | #include <linux/ctype.h> | ||
| 21 | #include <linux/errno.h> | ||
| 22 | #include <linux/module.h> | ||
| 23 | #include <asm/atomic.h> | ||
| 24 | |||
| 25 | #define DM_MSG_PREFIX "multipath queue-length" | ||
| 26 | #define QL_MIN_IO 128 | ||
| 27 | #define QL_VERSION "0.1.0" | ||
| 28 | |||
/*
 * Selector-wide state: all known paths, partitioned by health.
 */
struct selector {
	struct list_head valid_paths;	/* selectable paths; head is preferred */
	struct list_head failed_paths;	/* paths removed via ql_fail_path */
};

/*
 * Per-path state, attached to dm_path->pscontext by ql_add_path.
 */
struct path_info {
	struct list_head list;		/* link into one of the selector lists */
	struct dm_path *path;
	unsigned repeat_count;		/* I/Os to issue before re-selecting */
	atomic_t qlen;		/* the number of in-flight I/Os */
};
| 40 | |||
| 41 | static struct selector *alloc_selector(void) | ||
| 42 | { | ||
| 43 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
| 44 | |||
| 45 | if (s) { | ||
| 46 | INIT_LIST_HEAD(&s->valid_paths); | ||
| 47 | INIT_LIST_HEAD(&s->failed_paths); | ||
| 48 | } | ||
| 49 | |||
| 50 | return s; | ||
| 51 | } | ||
| 52 | |||
| 53 | static int ql_create(struct path_selector *ps, unsigned argc, char **argv) | ||
| 54 | { | ||
| 55 | struct selector *s = alloc_selector(); | ||
| 56 | |||
| 57 | if (!s) | ||
| 58 | return -ENOMEM; | ||
| 59 | |||
| 60 | ps->context = s; | ||
| 61 | return 0; | ||
| 62 | } | ||
| 63 | |||
| 64 | static void ql_free_paths(struct list_head *paths) | ||
| 65 | { | ||
| 66 | struct path_info *pi, *next; | ||
| 67 | |||
| 68 | list_for_each_entry_safe(pi, next, paths, list) { | ||
| 69 | list_del(&pi->list); | ||
| 70 | kfree(pi); | ||
| 71 | } | ||
| 72 | } | ||
| 73 | |||
| 74 | static void ql_destroy(struct path_selector *ps) | ||
| 75 | { | ||
| 76 | struct selector *s = ps->context; | ||
| 77 | |||
| 78 | ql_free_paths(&s->valid_paths); | ||
| 79 | ql_free_paths(&s->failed_paths); | ||
| 80 | kfree(s); | ||
| 81 | ps->context = NULL; | ||
| 82 | } | ||
| 83 | |||
/*
 * Status callback.  With a NULL @path, emit the selector-level argument
 * count ("0 " - this selector takes no selector args).  With a path,
 * emit either runtime info (current in-flight count) or the table line
 * (repeat_count), depending on @type.  Returns the number of bytes
 * written into @result (tracked by DMEMIT via 'sz').
 */
static int ql_status(struct path_selector *ps, struct dm_path *path,
		     status_type_t type, char *result, unsigned maxlen)
{
	unsigned sz = 0;
	struct path_info *pi;

	/* When called with NULL path, return selector status/args. */
	if (!path)
		DMEMIT("0 ");
	else {
		pi = path->pscontext;

		switch (type) {
		case STATUSTYPE_INFO:
			/* live queue depth for this path */
			DMEMIT("%d ", atomic_read(&pi->qlen));
			break;
		case STATUSTYPE_TABLE:
			/* constructor argument, so the table round-trips */
			DMEMIT("%u ", pi->repeat_count);
			break;
		}
	}

	return sz;
}
| 108 | |||
| 109 | static int ql_add_path(struct path_selector *ps, struct dm_path *path, | ||
| 110 | int argc, char **argv, char **error) | ||
| 111 | { | ||
| 112 | struct selector *s = ps->context; | ||
| 113 | struct path_info *pi; | ||
| 114 | unsigned repeat_count = QL_MIN_IO; | ||
| 115 | |||
| 116 | /* | ||
| 117 | * Arguments: [<repeat_count>] | ||
| 118 | * <repeat_count>: The number of I/Os before switching path. | ||
| 119 | * If not given, default (QL_MIN_IO) is used. | ||
| 120 | */ | ||
| 121 | if (argc > 1) { | ||
| 122 | *error = "queue-length ps: incorrect number of arguments"; | ||
| 123 | return -EINVAL; | ||
| 124 | } | ||
| 125 | |||
| 126 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
| 127 | *error = "queue-length ps: invalid repeat count"; | ||
| 128 | return -EINVAL; | ||
| 129 | } | ||
| 130 | |||
| 131 | /* Allocate the path information structure */ | ||
| 132 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
| 133 | if (!pi) { | ||
| 134 | *error = "queue-length ps: Error allocating path information"; | ||
| 135 | return -ENOMEM; | ||
| 136 | } | ||
| 137 | |||
| 138 | pi->path = path; | ||
| 139 | pi->repeat_count = repeat_count; | ||
| 140 | atomic_set(&pi->qlen, 0); | ||
| 141 | |||
| 142 | path->pscontext = pi; | ||
| 143 | |||
| 144 | list_add_tail(&pi->list, &s->valid_paths); | ||
| 145 | |||
| 146 | return 0; | ||
| 147 | } | ||
| 148 | |||
| 149 | static void ql_fail_path(struct path_selector *ps, struct dm_path *path) | ||
| 150 | { | ||
| 151 | struct selector *s = ps->context; | ||
| 152 | struct path_info *pi = path->pscontext; | ||
| 153 | |||
| 154 | list_move(&pi->list, &s->failed_paths); | ||
| 155 | } | ||
| 156 | |||
| 157 | static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
| 158 | { | ||
| 159 | struct selector *s = ps->context; | ||
| 160 | struct path_info *pi = path->pscontext; | ||
| 161 | |||
| 162 | list_move_tail(&pi->list, &s->valid_paths); | ||
| 163 | |||
| 164 | return 0; | ||
| 165 | } | ||
| 166 | |||
| 167 | /* | ||
| 168 | * Select a path having the minimum number of in-flight I/Os | ||
| 169 | */ | ||
| 170 | static struct dm_path *ql_select_path(struct path_selector *ps, | ||
| 171 | unsigned *repeat_count, size_t nr_bytes) | ||
| 172 | { | ||
| 173 | struct selector *s = ps->context; | ||
| 174 | struct path_info *pi = NULL, *best = NULL; | ||
| 175 | |||
| 176 | if (list_empty(&s->valid_paths)) | ||
| 177 | return NULL; | ||
| 178 | |||
| 179 | /* Change preferred (first in list) path to evenly balance. */ | ||
| 180 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
| 181 | |||
| 182 | list_for_each_entry(pi, &s->valid_paths, list) { | ||
| 183 | if (!best || | ||
| 184 | (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) | ||
| 185 | best = pi; | ||
| 186 | |||
| 187 | if (!atomic_read(&best->qlen)) | ||
| 188 | break; | ||
| 189 | } | ||
| 190 | |||
| 191 | if (!best) | ||
| 192 | return NULL; | ||
| 193 | |||
| 194 | *repeat_count = best->repeat_count; | ||
| 195 | |||
| 196 | return best->path; | ||
| 197 | } | ||
| 198 | |||
| 199 | static int ql_start_io(struct path_selector *ps, struct dm_path *path, | ||
| 200 | size_t nr_bytes) | ||
| 201 | { | ||
| 202 | struct path_info *pi = path->pscontext; | ||
| 203 | |||
| 204 | atomic_inc(&pi->qlen); | ||
| 205 | |||
| 206 | return 0; | ||
| 207 | } | ||
| 208 | |||
| 209 | static int ql_end_io(struct path_selector *ps, struct dm_path *path, | ||
| 210 | size_t nr_bytes) | ||
| 211 | { | ||
| 212 | struct path_info *pi = path->pscontext; | ||
| 213 | |||
| 214 | atomic_dec(&pi->qlen); | ||
| 215 | |||
| 216 | return 0; | ||
| 217 | } | ||
| 218 | |||
/*
 * Registration record for the "queue-length" path selector.
 * One table arg and one info arg: the per-path repeat_count.
 */
static struct path_selector_type ql_ps = {
	.name		= "queue-length",
	.module		= THIS_MODULE,
	.table_args	= 1,
	.info_args	= 1,
	.create		= ql_create,
	.destroy	= ql_destroy,
	.status		= ql_status,
	.add_path	= ql_add_path,
	.fail_path	= ql_fail_path,
	.reinstate_path	= ql_reinstate_path,
	.select_path	= ql_select_path,
	.start_io	= ql_start_io,
	.end_io		= ql_end_io,
};
| 234 | |||
| 235 | static int __init dm_ql_init(void) | ||
| 236 | { | ||
| 237 | int r = dm_register_path_selector(&ql_ps); | ||
| 238 | |||
| 239 | if (r < 0) | ||
| 240 | DMERR("register failed %d", r); | ||
| 241 | |||
| 242 | DMINFO("version " QL_VERSION " loaded"); | ||
| 243 | |||
| 244 | return r; | ||
| 245 | } | ||
| 246 | |||
| 247 | static void __exit dm_ql_exit(void) | ||
| 248 | { | ||
| 249 | int r = dm_unregister_path_selector(&ql_ps); | ||
| 250 | |||
| 251 | if (r < 0) | ||
| 252 | DMERR("unregister failed %d", r); | ||
| 253 | } | ||
| 254 | |||
/* Module entry/exit hooks and metadata (IBM authorship retained). */
module_init(dm_ql_init);
module_exit(dm_ql_exit);

MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
MODULE_DESCRIPTION(
	"(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n"
	DM_NAME " path selector to balance the number of in-flight I/Os"
);
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 076fbb4e967a..ce8868c768cc 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
| @@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type, | |||
| 1283 | return 0; | 1283 | return 0; |
| 1284 | } | 1284 | } |
| 1285 | 1285 | ||
/*
 * Invoke @fn once per mirror leg with its device and start offset.
 * The walk stops at the first nonzero return from @fn, which is then
 * propagated to the caller.
 */
static int mirror_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct mirror_set *ms = ti->private;
	int ret = 0;
	unsigned i;

	for (i = 0; !ret && i < ms->nr_mirrors; i++)
		ret = fn(ti, ms->mirror[i].dev,
			 ms->mirror[i].offset, data);

	return ret;
}
| 1299 | |||
| 1286 | static struct target_type mirror_target = { | 1300 | static struct target_type mirror_target = { |
| 1287 | .name = "mirror", | 1301 | .name = "mirror", |
| 1288 | .version = {1, 0, 20}, | 1302 | .version = {1, 12, 0}, |
| 1289 | .module = THIS_MODULE, | 1303 | .module = THIS_MODULE, |
| 1290 | .ctr = mirror_ctr, | 1304 | .ctr = mirror_ctr, |
| 1291 | .dtr = mirror_dtr, | 1305 | .dtr = mirror_dtr, |
| @@ -1295,6 +1309,7 @@ static struct target_type mirror_target = { | |||
| 1295 | .postsuspend = mirror_postsuspend, | 1309 | .postsuspend = mirror_postsuspend, |
| 1296 | .resume = mirror_resume, | 1310 | .resume = mirror_resume, |
| 1297 | .status = mirror_status, | 1311 | .status = mirror_status, |
| 1312 | .iterate_devices = mirror_iterate_devices, | ||
| 1298 | }; | 1313 | }; |
| 1299 | 1314 | ||
| 1300 | static int __init dm_mirror_init(void) | 1315 | static int __init dm_mirror_init(void) |
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7b899be0b087..36dbe29f2fd6 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c | |||
| @@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) | |||
| 283 | 283 | ||
| 284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); | 284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); |
| 285 | if (unlikely(!nreg)) | 285 | if (unlikely(!nreg)) |
| 286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO); | 286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); |
| 287 | 287 | ||
| 288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | 288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? |
| 289 | DM_RH_CLEAN : DM_RH_NOSYNC; | 289 | DM_RH_CLEAN : DM_RH_NOSYNC; |
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index cdfbf65b28cb..24752f449bef 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c | |||
| @@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) | |||
| 161 | } | 161 | } |
| 162 | 162 | ||
| 163 | static struct dm_path *rr_select_path(struct path_selector *ps, | 163 | static struct dm_path *rr_select_path(struct path_selector *ps, |
| 164 | unsigned *repeat_count) | 164 | unsigned *repeat_count, size_t nr_bytes) |
| 165 | { | 165 | { |
| 166 | struct selector *s = (struct selector *) ps->context; | 166 | struct selector *s = (struct selector *) ps->context; |
| 167 | struct path_info *pi = NULL; | 167 | struct path_info *pi = NULL; |
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c new file mode 100644 index 000000000000..cfa668f46c40 --- /dev/null +++ b/drivers/md/dm-service-time.c | |||
| @@ -0,0 +1,339 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. | ||
| 3 | * | ||
| 4 | * Module Author: Kiyoshi Ueda | ||
| 5 | * | ||
| 6 | * This file is released under the GPL. | ||
| 7 | * | ||
| 8 | * Throughput oriented path selector. | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include "dm.h" | ||
| 12 | #include "dm-path-selector.h" | ||
| 13 | |||
| 14 | #define DM_MSG_PREFIX "multipath service-time" | ||
| 15 | #define ST_MIN_IO 1 | ||
| 16 | #define ST_MAX_RELATIVE_THROUGHPUT 100 | ||
| 17 | #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 | ||
| 18 | #define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) | ||
| 19 | #define ST_VERSION "0.2.0" | ||
| 20 | |||
/*
 * Selector-wide state: all known paths, split by health.
 */
struct selector {
	struct list_head valid_paths;	/* selectable paths; head is preferred */
	struct list_head failed_paths;	/* paths removed via st_fail_path */
};

/*
 * Per-path state, attached to dm_path->pscontext by st_add_path.
 */
struct path_info {
	struct list_head list;		/* link into one of the selector lists */
	struct dm_path *path;
	unsigned repeat_count;		/* I/Os to issue before re-selecting */
	unsigned relative_throughput;	/* weight in [0, ST_MAX_RELATIVE_THROUGHPUT] */
	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
};
| 33 | |||
| 34 | static struct selector *alloc_selector(void) | ||
| 35 | { | ||
| 36 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
| 37 | |||
| 38 | if (s) { | ||
| 39 | INIT_LIST_HEAD(&s->valid_paths); | ||
| 40 | INIT_LIST_HEAD(&s->failed_paths); | ||
| 41 | } | ||
| 42 | |||
| 43 | return s; | ||
| 44 | } | ||
| 45 | |||
| 46 | static int st_create(struct path_selector *ps, unsigned argc, char **argv) | ||
| 47 | { | ||
| 48 | struct selector *s = alloc_selector(); | ||
| 49 | |||
| 50 | if (!s) | ||
| 51 | return -ENOMEM; | ||
| 52 | |||
| 53 | ps->context = s; | ||
| 54 | return 0; | ||
| 55 | } | ||
| 56 | |||
| 57 | static void free_paths(struct list_head *paths) | ||
| 58 | { | ||
| 59 | struct path_info *pi, *next; | ||
| 60 | |||
| 61 | list_for_each_entry_safe(pi, next, paths, list) { | ||
| 62 | list_del(&pi->list); | ||
| 63 | kfree(pi); | ||
| 64 | } | ||
| 65 | } | ||
| 66 | |||
| 67 | static void st_destroy(struct path_selector *ps) | ||
| 68 | { | ||
| 69 | struct selector *s = ps->context; | ||
| 70 | |||
| 71 | free_paths(&s->valid_paths); | ||
| 72 | free_paths(&s->failed_paths); | ||
| 73 | kfree(s); | ||
| 74 | ps->context = NULL; | ||
| 75 | } | ||
| 76 | |||
/*
 * Status callback.  With a NULL @path, emit the selector-level argument
 * count ("0 " - this selector takes no selector args).  With a path,
 * emit runtime info (in-flight size and weight) or the table line
 * (repeat_count and weight), depending on @type.  Returns the number
 * of bytes written into @result (tracked by DMEMIT via 'sz').
 */
static int st_status(struct path_selector *ps, struct dm_path *path,
		     status_type_t type, char *result, unsigned maxlen)
{
	unsigned sz = 0;
	struct path_info *pi;

	/* A NULL path asks for selector-level status/args. */
	if (!path)
		DMEMIT("0 ");
	else {
		pi = path->pscontext;

		switch (type) {
		case STATUSTYPE_INFO:
			/* live load and configured weight */
			DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
			       pi->relative_throughput);
			break;
		case STATUSTYPE_TABLE:
			/* constructor arguments, so the table round-trips */
			DMEMIT("%u %u ", pi->repeat_count,
			       pi->relative_throughput);
			break;
		}
	}

	return sz;
}
| 102 | |||
| 103 | static int st_add_path(struct path_selector *ps, struct dm_path *path, | ||
| 104 | int argc, char **argv, char **error) | ||
| 105 | { | ||
| 106 | struct selector *s = ps->context; | ||
| 107 | struct path_info *pi; | ||
| 108 | unsigned repeat_count = ST_MIN_IO; | ||
| 109 | unsigned relative_throughput = 1; | ||
| 110 | |||
| 111 | /* | ||
| 112 | * Arguments: [<repeat_count> [<relative_throughput>]] | ||
| 113 | * <repeat_count>: The number of I/Os before switching path. | ||
| 114 | * If not given, default (ST_MIN_IO) is used. | ||
| 115 | * <relative_throughput>: The relative throughput value of | ||
| 116 | * the path among all paths in the path-group. | ||
| 117 | * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> | ||
| 118 | * If not given, minimum value '1' is used. | ||
| 119 | * If '0' is given, the path isn't selected while | ||
| 120 | * other paths having a positive value are | ||
| 121 | * available. | ||
| 122 | */ | ||
| 123 | if (argc > 2) { | ||
| 124 | *error = "service-time ps: incorrect number of arguments"; | ||
| 125 | return -EINVAL; | ||
| 126 | } | ||
| 127 | |||
| 128 | if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | ||
| 129 | *error = "service-time ps: invalid repeat count"; | ||
| 130 | return -EINVAL; | ||
| 131 | } | ||
| 132 | |||
| 133 | if ((argc == 2) && | ||
| 134 | (sscanf(argv[1], "%u", &relative_throughput) != 1 || | ||
| 135 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { | ||
| 136 | *error = "service-time ps: invalid relative_throughput value"; | ||
| 137 | return -EINVAL; | ||
| 138 | } | ||
| 139 | |||
| 140 | /* allocate the path */ | ||
| 141 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | ||
| 142 | if (!pi) { | ||
| 143 | *error = "service-time ps: Error allocating path context"; | ||
| 144 | return -ENOMEM; | ||
| 145 | } | ||
| 146 | |||
| 147 | pi->path = path; | ||
| 148 | pi->repeat_count = repeat_count; | ||
| 149 | pi->relative_throughput = relative_throughput; | ||
| 150 | atomic_set(&pi->in_flight_size, 0); | ||
| 151 | |||
| 152 | path->pscontext = pi; | ||
| 153 | |||
| 154 | list_add_tail(&pi->list, &s->valid_paths); | ||
| 155 | |||
| 156 | return 0; | ||
| 157 | } | ||
| 158 | |||
| 159 | static void st_fail_path(struct path_selector *ps, struct dm_path *path) | ||
| 160 | { | ||
| 161 | struct selector *s = ps->context; | ||
| 162 | struct path_info *pi = path->pscontext; | ||
| 163 | |||
| 164 | list_move(&pi->list, &s->failed_paths); | ||
| 165 | } | ||
| 166 | |||
| 167 | static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) | ||
| 168 | { | ||
| 169 | struct selector *s = ps->context; | ||
| 170 | struct path_info *pi = path->pscontext; | ||
| 171 | |||
| 172 | list_move_tail(&pi->list, &s->valid_paths); | ||
| 173 | |||
| 174 | return 0; | ||
| 175 | } | ||
| 176 | |||
/*
 * Compare the estimated service time of 2 paths, pi1 and pi2,
 * for the incoming I/O.
 *
 * Returns:
 * < 0 : pi1 is better
 * 0   : no difference between pi1 and pi2
 * > 0 : pi2 is better
 *
 * Description:
 * Basically, the service time is estimated by:
 *	('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
 * To reduce the calculation, some optimizations are made.
 * (See comments inline)
 *
 * NOTE(review): the size_t differences below are narrowed to the int
 * return type; this relies on the differences fitting in an int -
 * confirm for very large in-flight sizes.
 */
static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
			   size_t incoming)
{
	size_t sz1, sz2, st1, st2;

	sz1 = atomic_read(&pi1->in_flight_size);
	sz2 = atomic_read(&pi2->in_flight_size);

	/*
	 * Case 1: Both have same throughput value. Choose less loaded path.
	 */
	if (pi1->relative_throughput == pi2->relative_throughput)
		return sz1 - sz2;

	/*
	 * Case 2a: Both have same load. Choose higher throughput path.
	 * Case 2b: One path has no throughput value. Choose the other one.
	 */
	if (sz1 == sz2 ||
	    !pi1->relative_throughput || !pi2->relative_throughput)
		return pi2->relative_throughput - pi1->relative_throughput;

	/*
	 * Case 3: Calculate service time. Choose faster path.
	 *	   Service time using pi1:
	 *		st1 = (sz1 + incoming) / pi1->relative_throughput
	 *	   Service time using pi2:
	 *		st2 = (sz2 + incoming) / pi2->relative_throughput
	 *
	 *	   To avoid the division, transform the expression to use
	 *	   multiplication.
	 *	   Because ->relative_throughput > 0 here, if st1 < st2,
	 *	   the expressions below are the same meaning:
	 *		(sz1 + incoming) / pi1->relative_throughput <
	 *		(sz2 + incoming) / pi2->relative_throughput
	 *		(sz1 + incoming) * pi2->relative_throughput <
	 *		(sz2 + incoming) * pi1->relative_throughput
	 *	   So use the later one.
	 */
	sz1 += incoming;
	sz2 += incoming;
	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
		/*
		 * Size may be too big for multiplying pi->relative_throughput
		 * and overflow.
		 * To avoid the overflow and mis-selection, shift down both.
		 */
		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
	}
	st1 = sz1 * pi2->relative_throughput;
	st2 = sz2 * pi1->relative_throughput;
	if (st1 != st2)
		return st1 - st2;

	/*
	 * Case 4: Service time is equal. Choose higher throughput path.
	 */
	return pi2->relative_throughput - pi1->relative_throughput;
}
| 253 | |||
| 254 | static struct dm_path *st_select_path(struct path_selector *ps, | ||
| 255 | unsigned *repeat_count, size_t nr_bytes) | ||
| 256 | { | ||
| 257 | struct selector *s = ps->context; | ||
| 258 | struct path_info *pi = NULL, *best = NULL; | ||
| 259 | |||
| 260 | if (list_empty(&s->valid_paths)) | ||
| 261 | return NULL; | ||
| 262 | |||
| 263 | /* Change preferred (first in list) path to evenly balance. */ | ||
| 264 | list_move_tail(s->valid_paths.next, &s->valid_paths); | ||
| 265 | |||
| 266 | list_for_each_entry(pi, &s->valid_paths, list) | ||
| 267 | if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) | ||
| 268 | best = pi; | ||
| 269 | |||
| 270 | if (!best) | ||
| 271 | return NULL; | ||
| 272 | |||
| 273 | *repeat_count = best->repeat_count; | ||
| 274 | |||
| 275 | return best->path; | ||
| 276 | } | ||
| 277 | |||
| 278 | static int st_start_io(struct path_selector *ps, struct dm_path *path, | ||
| 279 | size_t nr_bytes) | ||
| 280 | { | ||
| 281 | struct path_info *pi = path->pscontext; | ||
| 282 | |||
| 283 | atomic_add(nr_bytes, &pi->in_flight_size); | ||
| 284 | |||
| 285 | return 0; | ||
| 286 | } | ||
| 287 | |||
| 288 | static int st_end_io(struct path_selector *ps, struct dm_path *path, | ||
| 289 | size_t nr_bytes) | ||
| 290 | { | ||
| 291 | struct path_info *pi = path->pscontext; | ||
| 292 | |||
| 293 | atomic_sub(nr_bytes, &pi->in_flight_size); | ||
| 294 | |||
| 295 | return 0; | ||
| 296 | } | ||
| 297 | |||
/*
 * Registration record for the "service-time" path selector.
 * Two table/info args per path: repeat_count and relative_throughput.
 */
static struct path_selector_type st_ps = {
	.name		= "service-time",
	.module		= THIS_MODULE,
	.table_args	= 2,
	.info_args	= 2,
	.create		= st_create,
	.destroy	= st_destroy,
	.status		= st_status,
	.add_path	= st_add_path,
	.fail_path	= st_fail_path,
	.reinstate_path	= st_reinstate_path,
	.select_path	= st_select_path,
	.start_io	= st_start_io,
	.end_io		= st_end_io,
};
| 313 | |||
| 314 | static int __init dm_st_init(void) | ||
| 315 | { | ||
| 316 | int r = dm_register_path_selector(&st_ps); | ||
| 317 | |||
| 318 | if (r < 0) | ||
| 319 | DMERR("register failed %d", r); | ||
| 320 | |||
| 321 | DMINFO("version " ST_VERSION " loaded"); | ||
| 322 | |||
| 323 | return r; | ||
| 324 | } | ||
| 325 | |||
| 326 | static void __exit dm_st_exit(void) | ||
| 327 | { | ||
| 328 | int r = dm_unregister_path_selector(&st_ps); | ||
| 329 | |||
| 330 | if (r < 0) | ||
| 331 | DMERR("unregister failed %d", r); | ||
| 332 | } | ||
| 333 | |||
/* Module entry/exit hooks and metadata. */
module_init(dm_st_init);
module_exit(dm_st_exit);

MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index e75c6dd76a9a..6e3fe4f14934 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c | |||
| @@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) | |||
| 282 | */ | 282 | */ |
| 283 | if (!ps->store->chunk_size) { | 283 | if (!ps->store->chunk_size) { |
| 284 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, | 284 | ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, |
| 285 | bdev_hardsect_size(ps->store->cow->bdev) >> 9); | 285 | bdev_logical_block_size(ps->store->cow->bdev) >> 9); |
| 286 | ps->store->chunk_mask = ps->store->chunk_size - 1; | 286 | ps->store->chunk_mask = ps->store->chunk_size - 1; |
| 287 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; | 287 | ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; |
| 288 | chunk_size_supplied = 0; | 288 | chunk_size_supplied = 0; |
| @@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, | |||
| 636 | /* | 636 | /* |
| 637 | * Commit exceptions to disk. | 637 | * Commit exceptions to disk. |
| 638 | */ | 638 | */ |
| 639 | if (ps->valid && area_io(ps, WRITE)) | 639 | if (ps->valid && area_io(ps, WRITE_BARRIER)) |
| 640 | ps->valid = 0; | 640 | ps->valid = 0; |
| 641 | 641 | ||
| 642 | /* | 642 | /* |
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index d73f17fc7778..d573165cd2b7 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c | |||
| @@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 678 | 678 | ||
| 679 | ti->private = s; | 679 | ti->private = s; |
| 680 | ti->split_io = s->store->chunk_size; | 680 | ti->split_io = s->store->chunk_size; |
| 681 | ti->num_flush_requests = 1; | ||
| 681 | 682 | ||
| 682 | return 0; | 683 | return 0; |
| 683 | 684 | ||
| @@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, | |||
| 1030 | chunk_t chunk; | 1031 | chunk_t chunk; |
| 1031 | struct dm_snap_pending_exception *pe = NULL; | 1032 | struct dm_snap_pending_exception *pe = NULL; |
| 1032 | 1033 | ||
| 1034 | if (unlikely(bio_empty_barrier(bio))) { | ||
| 1035 | bio->bi_bdev = s->store->cow->bdev; | ||
| 1036 | return DM_MAPIO_REMAPPED; | ||
| 1037 | } | ||
| 1038 | |||
| 1033 | chunk = sector_to_chunk(s->store, bio->bi_sector); | 1039 | chunk = sector_to_chunk(s->store, bio->bi_sector); |
| 1034 | 1040 | ||
| 1035 | /* Full snapshots are not usable */ | 1041 | /* Full snapshots are not usable */ |
| @@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 1338 | } | 1344 | } |
| 1339 | 1345 | ||
| 1340 | ti->private = dev; | 1346 | ti->private = dev; |
| 1347 | ti->num_flush_requests = 1; | ||
| 1348 | |||
| 1341 | return 0; | 1349 | return 0; |
| 1342 | } | 1350 | } |
| 1343 | 1351 | ||
| @@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio, | |||
| 1353 | struct dm_dev *dev = ti->private; | 1361 | struct dm_dev *dev = ti->private; |
| 1354 | bio->bi_bdev = dev->bdev; | 1362 | bio->bi_bdev = dev->bdev; |
| 1355 | 1363 | ||
| 1364 | if (unlikely(bio_empty_barrier(bio))) | ||
| 1365 | return DM_MAPIO_REMAPPED; | ||
| 1366 | |||
| 1356 | /* Only tell snapshots if this is a write */ | 1367 | /* Only tell snapshots if this is a write */ |
| 1357 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; | 1368 | return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; |
| 1358 | } | 1369 | } |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 41569bc60abc..b240e85ae39a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
| @@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
| 167 | sc->stripes = stripes; | 167 | sc->stripes = stripes; |
| 168 | sc->stripe_width = width; | 168 | sc->stripe_width = width; |
| 169 | ti->split_io = chunk_size; | 169 | ti->split_io = chunk_size; |
| 170 | ti->num_flush_requests = stripes; | ||
| 170 | 171 | ||
| 171 | sc->chunk_mask = ((sector_t) chunk_size) - 1; | 172 | sc->chunk_mask = ((sector_t) chunk_size) - 1; |
| 172 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) | 173 | for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) |
| @@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, | |||
| 211 | union map_info *map_context) | 212 | union map_info *map_context) |
| 212 | { | 213 | { |
| 213 | struct stripe_c *sc = (struct stripe_c *) ti->private; | 214 | struct stripe_c *sc = (struct stripe_c *) ti->private; |
| 215 | sector_t offset, chunk; | ||
| 216 | uint32_t stripe; | ||
| 214 | 217 | ||
| 215 | sector_t offset = bio->bi_sector - ti->begin; | 218 | if (unlikely(bio_empty_barrier(bio))) { |
| 216 | sector_t chunk = offset >> sc->chunk_shift; | 219 | BUG_ON(map_context->flush_request >= sc->stripes); |
| 217 | uint32_t stripe = sector_div(chunk, sc->stripes); | 220 | bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev; |
| 221 | return DM_MAPIO_REMAPPED; | ||
| 222 | } | ||
| 223 | |||
| 224 | offset = bio->bi_sector - ti->begin; | ||
| 225 | chunk = offset >> sc->chunk_shift; | ||
| 226 | stripe = sector_div(chunk, sc->stripes); | ||
| 218 | 227 | ||
| 219 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; | 228 | bio->bi_bdev = sc->stripe[stripe].dev->bdev; |
| 220 | bio->bi_sector = sc->stripe[stripe].physical_start + | 229 | bio->bi_sector = sc->stripe[stripe].physical_start + |
| @@ -304,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, | |||
| 304 | return error; | 313 | return error; |
| 305 | } | 314 | } |
| 306 | 315 | ||
/*
 * Invoke @fn once per stripe with its device and physical start
 * sector.  The walk stops at the first nonzero return from @fn, which
 * is then propagated.  The do/while assumes sc->stripes >= 1 (the
 * constructor requires at least one stripe).
 */
static int stripe_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct stripe_c *sc = ti->private;
	int ret = 0;
	unsigned i = 0;

	do
		ret = fn(ti, sc->stripe[i].dev,
			 sc->stripe[i].physical_start, data);
	while (!ret && ++i < sc->stripes);

	return ret;
}
| 330 | |||
| 307 | static struct target_type stripe_target = { | 331 | static struct target_type stripe_target = { |
| 308 | .name = "striped", | 332 | .name = "striped", |
| 309 | .version = {1, 1, 0}, | 333 | .version = {1, 2, 0}, |
| 310 | .module = THIS_MODULE, | 334 | .module = THIS_MODULE, |
| 311 | .ctr = stripe_ctr, | 335 | .ctr = stripe_ctr, |
| 312 | .dtr = stripe_dtr, | 336 | .dtr = stripe_dtr, |
| 313 | .map = stripe_map, | 337 | .map = stripe_map, |
| 314 | .end_io = stripe_end_io, | 338 | .end_io = stripe_end_io, |
| 315 | .status = stripe_status, | 339 | .status = stripe_status, |
| 340 | .iterate_devices = stripe_iterate_devices, | ||
| 316 | }; | 341 | }; |
| 317 | 342 | ||
| 318 | int __init dm_stripe_init(void) | 343 | int __init dm_stripe_init(void) |
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index a2a45e6c7c8b..4b045903a4e2 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c | |||
| @@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) | |||
| 57 | return strlen(buf); | 57 | return strlen(buf); |
| 58 | } | 58 | } |
| 59 | 59 | ||
/*
 * sysfs 'suspended' attribute: emit dm_suspended(md) as a decimal
 * followed by a newline, returning the number of bytes written.
 */
static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
{
	sprintf(buf, "%d\n", dm_suspended(md));

	return strlen(buf);
}
| 66 | |||
| 60 | static DM_ATTR_RO(name); | 67 | static DM_ATTR_RO(name); |
| 61 | static DM_ATTR_RO(uuid); | 68 | static DM_ATTR_RO(uuid); |
| 69 | static DM_ATTR_RO(suspended); | ||
| 62 | 70 | ||
| 63 | static struct attribute *dm_attrs[] = { | 71 | static struct attribute *dm_attrs[] = { |
| 64 | &dm_attr_name.attr, | 72 | &dm_attr_name.attr, |
| 65 | &dm_attr_uuid.attr, | 73 | &dm_attr_uuid.attr, |
| 74 | &dm_attr_suspended.attr, | ||
| 66 | NULL, | 75 | NULL, |
| 67 | }; | 76 | }; |
| 68 | 77 | ||
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 429b50b975d5..4899ebe767c8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
| @@ -41,6 +41,7 @@ | |||
| 41 | struct dm_table { | 41 | struct dm_table { |
| 42 | struct mapped_device *md; | 42 | struct mapped_device *md; |
| 43 | atomic_t holders; | 43 | atomic_t holders; |
| 44 | unsigned type; | ||
| 44 | 45 | ||
| 45 | /* btree table */ | 46 | /* btree table */ |
| 46 | unsigned int depth; | 47 | unsigned int depth; |
| @@ -62,15 +63,11 @@ struct dm_table { | |||
| 62 | /* a list of devices used by this table */ | 63 | /* a list of devices used by this table */ |
| 63 | struct list_head devices; | 64 | struct list_head devices; |
| 64 | 65 | ||
| 65 | /* | ||
| 66 | * These are optimistic limits taken from all the | ||
| 67 | * targets, some targets will need smaller limits. | ||
| 68 | */ | ||
| 69 | struct io_restrictions limits; | ||
| 70 | |||
| 71 | /* events get handed up using this callback */ | 66 | /* events get handed up using this callback */ |
| 72 | void (*event_fn)(void *); | 67 | void (*event_fn)(void *); |
| 73 | void *event_context; | 68 | void *event_context; |
| 69 | |||
| 70 | struct dm_md_mempools *mempools; | ||
| 74 | }; | 71 | }; |
| 75 | 72 | ||
| 76 | /* | 73 | /* |
| @@ -89,42 +86,6 @@ static unsigned int int_log(unsigned int n, unsigned int base) | |||
| 89 | } | 86 | } |
| 90 | 87 | ||
| 91 | /* | 88 | /* |
| 92 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
| 93 | */ | ||
| 94 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
| 95 | |||
| 96 | /* | ||
| 97 | * Combine two io_restrictions, always taking the lower value. | ||
| 98 | */ | ||
| 99 | static void combine_restrictions_low(struct io_restrictions *lhs, | ||
| 100 | struct io_restrictions *rhs) | ||
| 101 | { | ||
| 102 | lhs->max_sectors = | ||
| 103 | min_not_zero(lhs->max_sectors, rhs->max_sectors); | ||
| 104 | |||
| 105 | lhs->max_phys_segments = | ||
| 106 | min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); | ||
| 107 | |||
| 108 | lhs->max_hw_segments = | ||
| 109 | min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); | ||
| 110 | |||
| 111 | lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); | ||
| 112 | |||
| 113 | lhs->max_segment_size = | ||
| 114 | min_not_zero(lhs->max_segment_size, rhs->max_segment_size); | ||
| 115 | |||
| 116 | lhs->max_hw_sectors = | ||
| 117 | min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors); | ||
| 118 | |||
| 119 | lhs->seg_boundary_mask = | ||
| 120 | min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); | ||
| 121 | |||
| 122 | lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn); | ||
| 123 | |||
| 124 | lhs->no_cluster |= rhs->no_cluster; | ||
| 125 | } | ||
| 126 | |||
| 127 | /* | ||
| 128 | * Calculate the index of the child node of the n'th node k'th key. | 89 | * Calculate the index of the child node of the n'th node k'th key. |
| 129 | */ | 90 | */ |
| 130 | static inline unsigned int get_child(unsigned int n, unsigned int k) | 91 | static inline unsigned int get_child(unsigned int n, unsigned int k) |
| @@ -266,6 +227,8 @@ static void free_devices(struct list_head *devices) | |||
| 266 | list_for_each_safe(tmp, next, devices) { | 227 | list_for_each_safe(tmp, next, devices) { |
| 267 | struct dm_dev_internal *dd = | 228 | struct dm_dev_internal *dd = |
| 268 | list_entry(tmp, struct dm_dev_internal, list); | 229 | list_entry(tmp, struct dm_dev_internal, list); |
| 230 | DMWARN("dm_table_destroy: dm_put_device call missing for %s", | ||
| 231 | dd->dm_dev.name); | ||
| 269 | kfree(dd); | 232 | kfree(dd); |
| 270 | } | 233 | } |
| 271 | } | 234 | } |
| @@ -295,12 +258,10 @@ void dm_table_destroy(struct dm_table *t) | |||
| 295 | vfree(t->highs); | 258 | vfree(t->highs); |
| 296 | 259 | ||
| 297 | /* free the device list */ | 260 | /* free the device list */ |
| 298 | if (t->devices.next != &t->devices) { | 261 | if (t->devices.next != &t->devices) |
| 299 | DMWARN("devices still present during destroy: " | ||
| 300 | "dm_table_remove_device calls missing"); | ||
| 301 | |||
| 302 | free_devices(&t->devices); | 262 | free_devices(&t->devices); |
| 303 | } | 263 | |
| 264 | dm_free_md_mempools(t->mempools); | ||
| 304 | 265 | ||
| 305 | kfree(t); | 266 | kfree(t); |
| 306 | } | 267 | } |
| @@ -384,15 +345,48 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) | |||
| 384 | /* | 345 | /* |
| 385 | * If possible, this checks an area of a destination device is valid. | 346 | * If possible, this checks an area of a destination device is valid. |
| 386 | */ | 347 | */ |
| 387 | static int check_device_area(struct dm_dev_internal *dd, sector_t start, | 348 | static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, |
| 388 | sector_t len) | 349 | sector_t start, void *data) |
| 389 | { | 350 | { |
| 390 | sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT; | 351 | struct queue_limits *limits = data; |
| 352 | struct block_device *bdev = dev->bdev; | ||
| 353 | sector_t dev_size = | ||
| 354 | i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; | ||
| 355 | unsigned short logical_block_size_sectors = | ||
| 356 | limits->logical_block_size >> SECTOR_SHIFT; | ||
| 357 | char b[BDEVNAME_SIZE]; | ||
| 391 | 358 | ||
| 392 | if (!dev_size) | 359 | if (!dev_size) |
| 393 | return 1; | 360 | return 1; |
| 394 | 361 | ||
| 395 | return ((start < dev_size) && (len <= (dev_size - start))); | 362 | if ((start >= dev_size) || (start + ti->len > dev_size)) { |
| 363 | DMWARN("%s: %s too small for target", | ||
| 364 | dm_device_name(ti->table->md), bdevname(bdev, b)); | ||
| 365 | return 0; | ||
| 366 | } | ||
| 367 | |||
| 368 | if (logical_block_size_sectors <= 1) | ||
| 369 | return 1; | ||
| 370 | |||
| 371 | if (start & (logical_block_size_sectors - 1)) { | ||
| 372 | DMWARN("%s: start=%llu not aligned to h/w " | ||
| 373 | "logical block size %hu of %s", | ||
| 374 | dm_device_name(ti->table->md), | ||
| 375 | (unsigned long long)start, | ||
| 376 | limits->logical_block_size, bdevname(bdev, b)); | ||
| 377 | return 0; | ||
| 378 | } | ||
| 379 | |||
| 380 | if (ti->len & (logical_block_size_sectors - 1)) { | ||
| 381 | DMWARN("%s: len=%llu not aligned to h/w " | ||
| 382 | "logical block size %hu of %s", | ||
| 383 | dm_device_name(ti->table->md), | ||
| 384 | (unsigned long long)ti->len, | ||
| 385 | limits->logical_block_size, bdevname(bdev, b)); | ||
| 386 | return 0; | ||
| 387 | } | ||
| 388 | |||
| 389 | return 1; | ||
| 396 | } | 390 | } |
| 397 | 391 | ||
| 398 | /* | 392 | /* |
| @@ -478,38 +472,32 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, | |||
| 478 | } | 472 | } |
| 479 | atomic_inc(&dd->count); | 473 | atomic_inc(&dd->count); |
| 480 | 474 | ||
| 481 | if (!check_device_area(dd, start, len)) { | ||
| 482 | DMWARN("device %s too small for target", path); | ||
| 483 | dm_put_device(ti, &dd->dm_dev); | ||
| 484 | return -EINVAL; | ||
| 485 | } | ||
| 486 | |||
| 487 | *result = &dd->dm_dev; | 475 | *result = &dd->dm_dev; |
| 488 | |||
| 489 | return 0; | 476 | return 0; |
| 490 | } | 477 | } |
| 491 | 478 | ||
| 492 | void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) | 479 | /* |
| 480 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
| 481 | */ | ||
| 482 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
| 483 | |||
| 484 | int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, | ||
| 485 | sector_t start, void *data) | ||
| 493 | { | 486 | { |
| 487 | struct queue_limits *limits = data; | ||
| 488 | struct block_device *bdev = dev->bdev; | ||
| 494 | struct request_queue *q = bdev_get_queue(bdev); | 489 | struct request_queue *q = bdev_get_queue(bdev); |
| 495 | struct io_restrictions *rs = &ti->limits; | ||
| 496 | char b[BDEVNAME_SIZE]; | 490 | char b[BDEVNAME_SIZE]; |
| 497 | 491 | ||
| 498 | if (unlikely(!q)) { | 492 | if (unlikely(!q)) { |
| 499 | DMWARN("%s: Cannot set limits for nonexistent device %s", | 493 | DMWARN("%s: Cannot set limits for nonexistent device %s", |
| 500 | dm_device_name(ti->table->md), bdevname(bdev, b)); | 494 | dm_device_name(ti->table->md), bdevname(bdev, b)); |
| 501 | return; | 495 | return 0; |
| 502 | } | 496 | } |
| 503 | 497 | ||
| 504 | /* | 498 | if (blk_stack_limits(limits, &q->limits, start) < 0) |
| 505 | * Combine the device limits low. | 499 | DMWARN("%s: target device %s is misaligned", |
| 506 | * | 500 | dm_device_name(ti->table->md), bdevname(bdev, b)); |
| 507 | * FIXME: if we move an io_restriction struct | ||
| 508 | * into q this would just be a call to | ||
| 509 | * combine_restrictions_low() | ||
| 510 | */ | ||
| 511 | rs->max_sectors = | ||
| 512 | min_not_zero(rs->max_sectors, q->max_sectors); | ||
| 513 | 501 | ||
| 514 | /* | 502 | /* |
| 515 | * Check if merge fn is supported. | 503 | * Check if merge fn is supported. |
| @@ -518,47 +506,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) | |||
| 518 | */ | 506 | */ |
| 519 | 507 | ||
| 520 | if (q->merge_bvec_fn && !ti->type->merge) | 508 | if (q->merge_bvec_fn && !ti->type->merge) |
| 521 | rs->max_sectors = | 509 | limits->max_sectors = |
| 522 | min_not_zero(rs->max_sectors, | 510 | min_not_zero(limits->max_sectors, |
| 523 | (unsigned int) (PAGE_SIZE >> 9)); | 511 | (unsigned int) (PAGE_SIZE >> 9)); |
| 524 | 512 | return 0; | |
| 525 | rs->max_phys_segments = | ||
| 526 | min_not_zero(rs->max_phys_segments, | ||
| 527 | q->max_phys_segments); | ||
| 528 | |||
| 529 | rs->max_hw_segments = | ||
| 530 | min_not_zero(rs->max_hw_segments, q->max_hw_segments); | ||
| 531 | |||
| 532 | rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); | ||
| 533 | |||
| 534 | rs->max_segment_size = | ||
| 535 | min_not_zero(rs->max_segment_size, q->max_segment_size); | ||
| 536 | |||
| 537 | rs->max_hw_sectors = | ||
| 538 | min_not_zero(rs->max_hw_sectors, q->max_hw_sectors); | ||
| 539 | |||
| 540 | rs->seg_boundary_mask = | ||
| 541 | min_not_zero(rs->seg_boundary_mask, | ||
| 542 | q->seg_boundary_mask); | ||
| 543 | |||
| 544 | rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn); | ||
| 545 | |||
| 546 | rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); | ||
| 547 | } | 513 | } |
| 548 | EXPORT_SYMBOL_GPL(dm_set_device_limits); | 514 | EXPORT_SYMBOL_GPL(dm_set_device_limits); |
| 549 | 515 | ||
| 550 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, | 516 | int dm_get_device(struct dm_target *ti, const char *path, sector_t start, |
| 551 | sector_t len, fmode_t mode, struct dm_dev **result) | 517 | sector_t len, fmode_t mode, struct dm_dev **result) |
| 552 | { | 518 | { |
| 553 | int r = __table_get_device(ti->table, ti, path, | 519 | return __table_get_device(ti->table, ti, path, |
| 554 | start, len, mode, result); | 520 | start, len, mode, result); |
| 555 | |||
| 556 | if (!r) | ||
| 557 | dm_set_device_limits(ti, (*result)->bdev); | ||
| 558 | |||
| 559 | return r; | ||
| 560 | } | 521 | } |
| 561 | 522 | ||
| 523 | |||
| 562 | /* | 524 | /* |
| 563 | * Decrement a devices use count and remove it if necessary. | 525 | * Decrement a devices use count and remove it if necessary. |
| 564 | */ | 526 | */ |
| @@ -673,24 +635,78 @@ int dm_split_args(int *argc, char ***argvp, char *input) | |||
| 673 | return 0; | 635 | return 0; |
| 674 | } | 636 | } |
| 675 | 637 | ||
| 676 | static void check_for_valid_limits(struct io_restrictions *rs) | 638 | /* |
| 639 | * Impose necessary and sufficient conditions on a devices's table such | ||
| 640 | * that any incoming bio which respects its logical_block_size can be | ||
| 641 | * processed successfully. If it falls across the boundary between | ||
| 642 | * two or more targets, the size of each piece it gets split into must | ||
| 643 | * be compatible with the logical_block_size of the target processing it. | ||
| 644 | */ | ||
| 645 | static int validate_hardware_logical_block_alignment(struct dm_table *table, | ||
| 646 | struct queue_limits *limits) | ||
| 677 | { | 647 | { |
| 678 | if (!rs->max_sectors) | 648 | /* |
| 679 | rs->max_sectors = SAFE_MAX_SECTORS; | 649 | * This function uses arithmetic modulo the logical_block_size |
| 680 | if (!rs->max_hw_sectors) | 650 | * (in units of 512-byte sectors). |
| 681 | rs->max_hw_sectors = SAFE_MAX_SECTORS; | 651 | */ |
| 682 | if (!rs->max_phys_segments) | 652 | unsigned short device_logical_block_size_sects = |
| 683 | rs->max_phys_segments = MAX_PHYS_SEGMENTS; | 653 | limits->logical_block_size >> SECTOR_SHIFT; |
| 684 | if (!rs->max_hw_segments) | 654 | |
| 685 | rs->max_hw_segments = MAX_HW_SEGMENTS; | 655 | /* |
| 686 | if (!rs->hardsect_size) | 656 | * Offset of the start of the next table entry, mod logical_block_size. |
| 687 | rs->hardsect_size = 1 << SECTOR_SHIFT; | 657 | */ |
| 688 | if (!rs->max_segment_size) | 658 | unsigned short next_target_start = 0; |
| 689 | rs->max_segment_size = MAX_SEGMENT_SIZE; | 659 | |
| 690 | if (!rs->seg_boundary_mask) | 660 | /* |
| 691 | rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; | 661 | * Given an aligned bio that extends beyond the end of a |
| 692 | if (!rs->bounce_pfn) | 662 | * target, how many sectors must the next target handle? |
| 693 | rs->bounce_pfn = -1; | 663 | */ |
| 664 | unsigned short remaining = 0; | ||
| 665 | |||
| 666 | struct dm_target *uninitialized_var(ti); | ||
| 667 | struct queue_limits ti_limits; | ||
| 668 | unsigned i = 0; | ||
| 669 | |||
| 670 | /* | ||
| 671 | * Check each entry in the table in turn. | ||
| 672 | */ | ||
| 673 | while (i < dm_table_get_num_targets(table)) { | ||
| 674 | ti = dm_table_get_target(table, i++); | ||
| 675 | |||
| 676 | blk_set_default_limits(&ti_limits); | ||
| 677 | |||
| 678 | /* combine all target devices' limits */ | ||
| 679 | if (ti->type->iterate_devices) | ||
| 680 | ti->type->iterate_devices(ti, dm_set_device_limits, | ||
| 681 | &ti_limits); | ||
| 682 | |||
| 683 | /* | ||
| 684 | * If the remaining sectors fall entirely within this | ||
| 685 | * table entry are they compatible with its logical_block_size? | ||
| 686 | */ | ||
| 687 | if (remaining < ti->len && | ||
| 688 | remaining & ((ti_limits.logical_block_size >> | ||
| 689 | SECTOR_SHIFT) - 1)) | ||
| 690 | break; /* Error */ | ||
| 691 | |||
| 692 | next_target_start = | ||
| 693 | (unsigned short) ((next_target_start + ti->len) & | ||
| 694 | (device_logical_block_size_sects - 1)); | ||
| 695 | remaining = next_target_start ? | ||
| 696 | device_logical_block_size_sects - next_target_start : 0; | ||
| 697 | } | ||
| 698 | |||
| 699 | if (remaining) { | ||
| 700 | DMWARN("%s: table line %u (start sect %llu len %llu) " | ||
| 701 | "not aligned to h/w logical block size %hu", | ||
| 702 | dm_device_name(table->md), i, | ||
| 703 | (unsigned long long) ti->begin, | ||
| 704 | (unsigned long long) ti->len, | ||
| 705 | limits->logical_block_size); | ||
| 706 | return -EINVAL; | ||
| 707 | } | ||
| 708 | |||
| 709 | return 0; | ||
| 694 | } | 710 | } |
| 695 | 711 | ||
| 696 | int dm_table_add_target(struct dm_table *t, const char *type, | 712 | int dm_table_add_target(struct dm_table *t, const char *type, |
| @@ -745,9 +761,6 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
| 745 | 761 | ||
| 746 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; | 762 | t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; |
| 747 | 763 | ||
| 748 | /* FIXME: the plan is to combine high here and then have | ||
| 749 | * the merge fn apply the target level restrictions. */ | ||
| 750 | combine_restrictions_low(&t->limits, &tgt->limits); | ||
| 751 | return 0; | 764 | return 0; |
| 752 | 765 | ||
| 753 | bad: | 766 | bad: |
| @@ -756,6 +769,104 @@ int dm_table_add_target(struct dm_table *t, const char *type, | |||
| 756 | return r; | 769 | return r; |
| 757 | } | 770 | } |
| 758 | 771 | ||
| 772 | int dm_table_set_type(struct dm_table *t) | ||
| 773 | { | ||
| 774 | unsigned i; | ||
| 775 | unsigned bio_based = 0, request_based = 0; | ||
| 776 | struct dm_target *tgt; | ||
| 777 | struct dm_dev_internal *dd; | ||
| 778 | struct list_head *devices; | ||
| 779 | |||
| 780 | for (i = 0; i < t->num_targets; i++) { | ||
| 781 | tgt = t->targets + i; | ||
| 782 | if (dm_target_request_based(tgt)) | ||
| 783 | request_based = 1; | ||
| 784 | else | ||
| 785 | bio_based = 1; | ||
| 786 | |||
| 787 | if (bio_based && request_based) { | ||
| 788 | DMWARN("Inconsistent table: different target types" | ||
| 789 | " can't be mixed up"); | ||
| 790 | return -EINVAL; | ||
| 791 | } | ||
| 792 | } | ||
| 793 | |||
| 794 | if (bio_based) { | ||
| 795 | /* We must use this table as bio-based */ | ||
| 796 | t->type = DM_TYPE_BIO_BASED; | ||
| 797 | return 0; | ||
| 798 | } | ||
| 799 | |||
| 800 | BUG_ON(!request_based); /* No targets in this table */ | ||
| 801 | |||
| 802 | /* Non-request-stackable devices can't be used for request-based dm */ | ||
| 803 | devices = dm_table_get_devices(t); | ||
| 804 | list_for_each_entry(dd, devices, list) { | ||
| 805 | if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { | ||
| 806 | DMWARN("table load rejected: including" | ||
| 807 | " non-request-stackable devices"); | ||
| 808 | return -EINVAL; | ||
| 809 | } | ||
| 810 | } | ||
| 811 | |||
| 812 | /* | ||
| 813 | * Request-based dm supports only tables that have a single target now. | ||
| 814 | * To support multiple targets, request splitting support is needed, | ||
| 815 | * and that needs lots of changes in the block-layer. | ||
| 816 | * (e.g. request completion process for partial completion.) | ||
| 817 | */ | ||
| 818 | if (t->num_targets > 1) { | ||
| 819 | DMWARN("Request-based dm doesn't support multiple targets yet"); | ||
| 820 | return -EINVAL; | ||
| 821 | } | ||
| 822 | |||
| 823 | t->type = DM_TYPE_REQUEST_BASED; | ||
| 824 | |||
| 825 | return 0; | ||
| 826 | } | ||
| 827 | |||
| 828 | unsigned dm_table_get_type(struct dm_table *t) | ||
| 829 | { | ||
| 830 | return t->type; | ||
| 831 | } | ||
| 832 | |||
| 833 | bool dm_table_bio_based(struct dm_table *t) | ||
| 834 | { | ||
| 835 | return dm_table_get_type(t) == DM_TYPE_BIO_BASED; | ||
| 836 | } | ||
| 837 | |||
| 838 | bool dm_table_request_based(struct dm_table *t) | ||
| 839 | { | ||
| 840 | return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; | ||
| 841 | } | ||
| 842 | |||
| 843 | int dm_table_alloc_md_mempools(struct dm_table *t) | ||
| 844 | { | ||
| 845 | unsigned type = dm_table_get_type(t); | ||
| 846 | |||
| 847 | if (unlikely(type == DM_TYPE_NONE)) { | ||
| 848 | DMWARN("no table type is set, can't allocate mempools"); | ||
| 849 | return -EINVAL; | ||
| 850 | } | ||
| 851 | |||
| 852 | t->mempools = dm_alloc_md_mempools(type); | ||
| 853 | if (!t->mempools) | ||
| 854 | return -ENOMEM; | ||
| 855 | |||
| 856 | return 0; | ||
| 857 | } | ||
| 858 | |||
| 859 | void dm_table_free_md_mempools(struct dm_table *t) | ||
| 860 | { | ||
| 861 | dm_free_md_mempools(t->mempools); | ||
| 862 | t->mempools = NULL; | ||
| 863 | } | ||
| 864 | |||
| 865 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t) | ||
| 866 | { | ||
| 867 | return t->mempools; | ||
| 868 | } | ||
| 869 | |||
| 759 | static int setup_indexes(struct dm_table *t) | 870 | static int setup_indexes(struct dm_table *t) |
| 760 | { | 871 | { |
| 761 | int i; | 872 | int i; |
| @@ -790,8 +901,6 @@ int dm_table_complete(struct dm_table *t) | |||
| 790 | int r = 0; | 901 | int r = 0; |
| 791 | unsigned int leaf_nodes; | 902 | unsigned int leaf_nodes; |
| 792 | 903 | ||
| 793 | check_for_valid_limits(&t->limits); | ||
| 794 | |||
| 795 | /* how many indexes will the btree have ? */ | 904 | /* how many indexes will the btree have ? */ |
| 796 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); | 905 | leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); |
| 797 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); | 906 | t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); |
| @@ -867,6 +976,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) | |||
| 867 | } | 976 | } |
| 868 | 977 | ||
| 869 | /* | 978 | /* |
| 979 | * Establish the new table's queue_limits and validate them. | ||
| 980 | */ | ||
| 981 | int dm_calculate_queue_limits(struct dm_table *table, | ||
| 982 | struct queue_limits *limits) | ||
| 983 | { | ||
| 984 | struct dm_target *uninitialized_var(ti); | ||
| 985 | struct queue_limits ti_limits; | ||
| 986 | unsigned i = 0; | ||
| 987 | |||
| 988 | blk_set_default_limits(limits); | ||
| 989 | |||
| 990 | while (i < dm_table_get_num_targets(table)) { | ||
| 991 | blk_set_default_limits(&ti_limits); | ||
| 992 | |||
| 993 | ti = dm_table_get_target(table, i++); | ||
| 994 | |||
| 995 | if (!ti->type->iterate_devices) | ||
| 996 | goto combine_limits; | ||
| 997 | |||
| 998 | /* | ||
| 999 | * Combine queue limits of all the devices this target uses. | ||
| 1000 | */ | ||
| 1001 | ti->type->iterate_devices(ti, dm_set_device_limits, | ||
| 1002 | &ti_limits); | ||
| 1003 | |||
| 1004 | /* | ||
| 1005 | * Check each device area is consistent with the target's | ||
| 1006 | * overall queue limits. | ||
| 1007 | */ | ||
| 1008 | if (!ti->type->iterate_devices(ti, device_area_is_valid, | ||
| 1009 | &ti_limits)) | ||
| 1010 | return -EINVAL; | ||
| 1011 | |||
| 1012 | combine_limits: | ||
| 1013 | /* | ||
| 1014 | * Merge this target's queue limits into the overall limits | ||
| 1015 | * for the table. | ||
| 1016 | */ | ||
| 1017 | if (blk_stack_limits(limits, &ti_limits, 0) < 0) | ||
| 1018 | DMWARN("%s: target device " | ||
| 1019 | "(start sect %llu len %llu) " | ||
| 1020 | "is misaligned", | ||
| 1021 | dm_device_name(table->md), | ||
| 1022 | (unsigned long long) ti->begin, | ||
| 1023 | (unsigned long long) ti->len); | ||
| 1024 | } | ||
| 1025 | |||
| 1026 | return validate_hardware_logical_block_alignment(table, limits); | ||
| 1027 | } | ||
| 1028 | |||
| 1029 | /* | ||
| 870 | * Set the integrity profile for this device if all devices used have | 1030 | * Set the integrity profile for this device if all devices used have |
| 871 | * matching profiles. | 1031 | * matching profiles. |
| 872 | */ | 1032 | */ |
| @@ -905,27 +1065,42 @@ no_integrity: | |||
| 905 | return; | 1065 | return; |
| 906 | } | 1066 | } |
| 907 | 1067 | ||
| 908 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) | 1068 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, |
| 1069 | struct queue_limits *limits) | ||
| 909 | { | 1070 | { |
| 910 | /* | 1071 | /* |
| 911 | * Make sure we obey the optimistic sub devices | 1072 | * Each target device in the table has a data area that should normally |
| 912 | * restrictions. | 1073 | * be aligned such that the DM device's alignment_offset is 0. |
| 1074 | * FIXME: Propagate alignment_offsets up the stack and warn of | ||
| 1075 | * sub-optimal or inconsistent settings. | ||
| 1076 | */ | ||
| 1077 | limits->alignment_offset = 0; | ||
| 1078 | limits->misaligned = 0; | ||
| 1079 | |||
| 1080 | /* | ||
| 1081 | * Copy table's limits to the DM device's request_queue | ||
| 913 | */ | 1082 | */ |
| 914 | blk_queue_max_sectors(q, t->limits.max_sectors); | 1083 | q->limits = *limits; |
| 915 | q->max_phys_segments = t->limits.max_phys_segments; | 1084 | |
| 916 | q->max_hw_segments = t->limits.max_hw_segments; | 1085 | if (limits->no_cluster) |
| 917 | q->hardsect_size = t->limits.hardsect_size; | ||
| 918 | q->max_segment_size = t->limits.max_segment_size; | ||
| 919 | q->max_hw_sectors = t->limits.max_hw_sectors; | ||
| 920 | q->seg_boundary_mask = t->limits.seg_boundary_mask; | ||
| 921 | q->bounce_pfn = t->limits.bounce_pfn; | ||
| 922 | |||
| 923 | if (t->limits.no_cluster) | ||
| 924 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); | 1086 | queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); |
| 925 | else | 1087 | else |
| 926 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); | 1088 | queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); |
| 927 | 1089 | ||
| 928 | dm_table_set_integrity(t); | 1090 | dm_table_set_integrity(t); |
| 1091 | |||
| 1092 | /* | ||
| 1093 | * QUEUE_FLAG_STACKABLE must be set after all queue settings are | ||
| 1094 | * visible to other CPUs because, once the flag is set, incoming bios | ||
| 1095 | * are processed by request-based dm, which refers to the queue | ||
| 1096 | * settings. | ||
| 1097 | * Until the flag set, bios are passed to bio-based dm and queued to | ||
| 1098 | * md->deferred where queue settings are not needed yet. | ||
| 1099 | * Those bios are passed to request-based dm at the resume time. | ||
| 1100 | */ | ||
| 1101 | smp_mb(); | ||
| 1102 | if (dm_table_request_based(t)) | ||
| 1103 | queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); | ||
| 929 | } | 1104 | } |
| 930 | 1105 | ||
| 931 | unsigned int dm_table_get_num_targets(struct dm_table *t) | 1106 | unsigned int dm_table_get_num_targets(struct dm_table *t) |
| @@ -1021,6 +1196,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) | |||
| 1021 | return r; | 1196 | return r; |
| 1022 | } | 1197 | } |
| 1023 | 1198 | ||
| 1199 | int dm_table_any_busy_target(struct dm_table *t) | ||
| 1200 | { | ||
| 1201 | unsigned i; | ||
| 1202 | struct dm_target *ti; | ||
| 1203 | |||
| 1204 | for (i = 0; i < t->num_targets; i++) { | ||
| 1205 | ti = t->targets + i; | ||
| 1206 | if (ti->type->busy && ti->type->busy(ti)) | ||
| 1207 | return 1; | ||
| 1208 | } | ||
| 1209 | |||
| 1210 | return 0; | ||
| 1211 | } | ||
| 1212 | |||
| 1024 | void dm_table_unplug_all(struct dm_table *t) | 1213 | void dm_table_unplug_all(struct dm_table *t) |
| 1025 | { | 1214 | { |
| 1026 | struct dm_dev_internal *dd; | 1215 | struct dm_dev_internal *dd; |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 424f7b048c30..3c6d4ee8921d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
| @@ -19,11 +19,18 @@ | |||
| 19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
| 20 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
| 21 | #include <linux/hdreg.h> | 21 | #include <linux/hdreg.h> |
| 22 | #include <linux/blktrace_api.h> | 22 | |
| 23 | #include <trace/block.h> | 23 | #include <trace/events/block.h> |
| 24 | 24 | ||
| 25 | #define DM_MSG_PREFIX "core" | 25 | #define DM_MSG_PREFIX "core" |
| 26 | 26 | ||
| 27 | /* | ||
| 28 | * Cookies are numeric values sent with CHANGE and REMOVE | ||
| 29 | * uevents while resuming, removing or renaming the device. | ||
| 30 | */ | ||
| 31 | #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" | ||
| 32 | #define DM_COOKIE_LENGTH 24 | ||
| 33 | |||
| 27 | static const char *_name = DM_NAME; | 34 | static const char *_name = DM_NAME; |
| 28 | 35 | ||
| 29 | static unsigned int major = 0; | 36 | static unsigned int major = 0; |
| @@ -53,8 +60,6 @@ struct dm_target_io { | |||
| 53 | union map_info info; | 60 | union map_info info; |
| 54 | }; | 61 | }; |
| 55 | 62 | ||
| 56 | DEFINE_TRACE(block_bio_complete); | ||
| 57 | |||
| 58 | /* | 63 | /* |
| 59 | * For request-based dm. | 64 | * For request-based dm. |
| 60 | * One of these is allocated per request. | 65 | * One of these is allocated per request. |
| @@ -73,7 +78,7 @@ struct dm_rq_target_io { | |||
| 73 | */ | 78 | */ |
| 74 | struct dm_rq_clone_bio_info { | 79 | struct dm_rq_clone_bio_info { |
| 75 | struct bio *orig; | 80 | struct bio *orig; |
| 76 | struct request *rq; | 81 | struct dm_rq_target_io *tio; |
| 77 | }; | 82 | }; |
| 78 | 83 | ||
| 79 | union map_info *dm_get_mapinfo(struct bio *bio) | 84 | union map_info *dm_get_mapinfo(struct bio *bio) |
| @@ -83,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio) | |||
| 83 | return NULL; | 88 | return NULL; |
| 84 | } | 89 | } |
| 85 | 90 | ||
| 91 | union map_info *dm_get_rq_mapinfo(struct request *rq) | ||
| 92 | { | ||
| 93 | if (rq && rq->end_io_data) | ||
| 94 | return &((struct dm_rq_target_io *)rq->end_io_data)->info; | ||
| 95 | return NULL; | ||
| 96 | } | ||
| 97 | EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); | ||
| 98 | |||
| 86 | #define MINOR_ALLOCED ((void *)-1) | 99 | #define MINOR_ALLOCED ((void *)-1) |
| 87 | 100 | ||
| 88 | /* | 101 | /* |
| @@ -159,13 +172,31 @@ struct mapped_device { | |||
| 159 | * freeze/thaw support require holding onto a super block | 172 | * freeze/thaw support require holding onto a super block |
| 160 | */ | 173 | */ |
| 161 | struct super_block *frozen_sb; | 174 | struct super_block *frozen_sb; |
| 162 | struct block_device *suspended_bdev; | 175 | struct block_device *bdev; |
| 163 | 176 | ||
| 164 | /* forced geometry settings */ | 177 | /* forced geometry settings */ |
| 165 | struct hd_geometry geometry; | 178 | struct hd_geometry geometry; |
| 166 | 179 | ||
| 180 | /* marker of flush suspend for request-based dm */ | ||
| 181 | struct request suspend_rq; | ||
| 182 | |||
| 183 | /* For saving the address of __make_request for request based dm */ | ||
| 184 | make_request_fn *saved_make_request_fn; | ||
| 185 | |||
| 167 | /* sysfs handle */ | 186 | /* sysfs handle */ |
| 168 | struct kobject kobj; | 187 | struct kobject kobj; |
| 188 | |||
| 189 | /* zero-length barrier that will be cloned and submitted to targets */ | ||
| 190 | struct bio barrier_bio; | ||
| 191 | }; | ||
| 192 | |||
| 193 | /* | ||
| 194 | * For mempools pre-allocation at the table loading time. | ||
| 195 | */ | ||
| 196 | struct dm_md_mempools { | ||
| 197 | mempool_t *io_pool; | ||
| 198 | mempool_t *tio_pool; | ||
| 199 | struct bio_set *bs; | ||
| 169 | }; | 200 | }; |
| 170 | 201 | ||
| 171 | #define MIN_IOS 256 | 202 | #define MIN_IOS 256 |
| @@ -393,14 +424,29 @@ static void free_io(struct mapped_device *md, struct dm_io *io) | |||
| 393 | mempool_free(io, md->io_pool); | 424 | mempool_free(io, md->io_pool); |
| 394 | } | 425 | } |
| 395 | 426 | ||
| 396 | static struct dm_target_io *alloc_tio(struct mapped_device *md) | 427 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) |
| 428 | { | ||
| 429 | mempool_free(tio, md->tio_pool); | ||
| 430 | } | ||
| 431 | |||
| 432 | static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md) | ||
| 397 | { | 433 | { |
| 398 | return mempool_alloc(md->tio_pool, GFP_NOIO); | 434 | return mempool_alloc(md->tio_pool, GFP_ATOMIC); |
| 399 | } | 435 | } |
| 400 | 436 | ||
| 401 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) | 437 | static void free_rq_tio(struct dm_rq_target_io *tio) |
| 402 | { | 438 | { |
| 403 | mempool_free(tio, md->tio_pool); | 439 | mempool_free(tio, tio->md->tio_pool); |
| 440 | } | ||
| 441 | |||
| 442 | static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) | ||
| 443 | { | ||
| 444 | return mempool_alloc(md->io_pool, GFP_ATOMIC); | ||
| 445 | } | ||
| 446 | |||
| 447 | static void free_bio_info(struct dm_rq_clone_bio_info *info) | ||
| 448 | { | ||
| 449 | mempool_free(info, info->tio->md->io_pool); | ||
| 404 | } | 450 | } |
| 405 | 451 | ||
| 406 | static void start_io_acct(struct dm_io *io) | 452 | static void start_io_acct(struct dm_io *io) |
| @@ -466,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio) | |||
| 466 | struct dm_table *dm_get_table(struct mapped_device *md) | 512 | struct dm_table *dm_get_table(struct mapped_device *md) |
| 467 | { | 513 | { |
| 468 | struct dm_table *t; | 514 | struct dm_table *t; |
| 515 | unsigned long flags; | ||
| 469 | 516 | ||
| 470 | read_lock(&md->map_lock); | 517 | read_lock_irqsave(&md->map_lock, flags); |
| 471 | t = md->map; | 518 | t = md->map; |
| 472 | if (t) | 519 | if (t) |
| 473 | dm_table_get(t); | 520 | dm_table_get(t); |
| 474 | read_unlock(&md->map_lock); | 521 | read_unlock_irqrestore(&md->map_lock, flags); |
| 475 | 522 | ||
| 476 | return t; | 523 | return t; |
| 477 | } | 524 | } |
| @@ -538,9 +585,11 @@ static void dec_pending(struct dm_io *io, int error) | |||
| 538 | * Target requested pushing back the I/O. | 585 | * Target requested pushing back the I/O. |
| 539 | */ | 586 | */ |
| 540 | spin_lock_irqsave(&md->deferred_lock, flags); | 587 | spin_lock_irqsave(&md->deferred_lock, flags); |
| 541 | if (__noflush_suspending(md)) | 588 | if (__noflush_suspending(md)) { |
| 542 | bio_list_add_head(&md->deferred, io->bio); | 589 | if (!bio_barrier(io->bio)) |
| 543 | else | 590 | bio_list_add_head(&md->deferred, |
| 591 | io->bio); | ||
| 592 | } else | ||
| 544 | /* noflush suspend was interrupted. */ | 593 | /* noflush suspend was interrupted. */ |
| 545 | io->error = -EIO; | 594 | io->error = -EIO; |
| 546 | spin_unlock_irqrestore(&md->deferred_lock, flags); | 595 | spin_unlock_irqrestore(&md->deferred_lock, flags); |
| @@ -555,7 +604,8 @@ static void dec_pending(struct dm_io *io, int error) | |||
| 555 | * a per-device variable for error reporting. | 604 | * a per-device variable for error reporting. |
| 556 | * Note that you can't touch the bio after end_io_acct | 605 | * Note that you can't touch the bio after end_io_acct |
| 557 | */ | 606 | */ |
| 558 | md->barrier_error = io_error; | 607 | if (!md->barrier_error && io_error != -EOPNOTSUPP) |
| 608 | md->barrier_error = io_error; | ||
| 559 | end_io_acct(io); | 609 | end_io_acct(io); |
| 560 | } else { | 610 | } else { |
| 561 | end_io_acct(io); | 611 | end_io_acct(io); |
| @@ -609,6 +659,262 @@ static void clone_endio(struct bio *bio, int error) | |||
| 609 | dec_pending(io, error); | 659 | dec_pending(io, error); |
| 610 | } | 660 | } |
| 611 | 661 | ||
| 662 | /* | ||
| 663 | * Partial completion handling for request-based dm | ||
| 664 | */ | ||
| 665 | static void end_clone_bio(struct bio *clone, int error) | ||
| 666 | { | ||
| 667 | struct dm_rq_clone_bio_info *info = clone->bi_private; | ||
| 668 | struct dm_rq_target_io *tio = info->tio; | ||
| 669 | struct bio *bio = info->orig; | ||
| 670 | unsigned int nr_bytes = info->orig->bi_size; | ||
| 671 | |||
| 672 | bio_put(clone); | ||
| 673 | |||
| 674 | if (tio->error) | ||
| 675 | /* | ||
| 676 | * An error has already been detected on the request. | ||
| 677 | * Once error occurred, just let clone->end_io() handle | ||
| 678 | * the remainder. | ||
| 679 | */ | ||
| 680 | return; | ||
| 681 | else if (error) { | ||
| 682 | /* | ||
| 683 | * Don't notice the error to the upper layer yet. | ||
| 684 | * The error handling decision is made by the target driver, | ||
| 685 | * when the request is completed. | ||
| 686 | */ | ||
| 687 | tio->error = error; | ||
| 688 | return; | ||
| 689 | } | ||
| 690 | |||
| 691 | /* | ||
| 692 | * I/O for the bio successfully completed. | ||
| 693 | * Notice the data completion to the upper layer. | ||
| 694 | */ | ||
| 695 | |||
| 696 | /* | ||
| 697 | * bios are processed from the head of the list. | ||
| 698 | * So the completing bio should always be rq->bio. | ||
| 699 | * If it's not, something wrong is happening. | ||
| 700 | */ | ||
| 701 | if (tio->orig->bio != bio) | ||
| 702 | DMERR("bio completion is going in the middle of the request"); | ||
| 703 | |||
| 704 | /* | ||
| 705 | * Update the original request. | ||
| 706 | * Do not use blk_end_request() here, because it may complete | ||
| 707 | * the original request before the clone, and break the ordering. | ||
| 708 | */ | ||
| 709 | blk_update_request(tio->orig, 0, nr_bytes); | ||
| 710 | } | ||
| 711 | |||
| 712 | /* | ||
| 713 | * Don't touch any member of the md after calling this function because | ||
| 714 | * the md may be freed in dm_put() at the end of this function. | ||
| 715 | * Or do dm_get() before calling this function and dm_put() later. | ||
| 716 | */ | ||
| 717 | static void rq_completed(struct mapped_device *md, int run_queue) | ||
| 718 | { | ||
| 719 | int wakeup_waiters = 0; | ||
| 720 | struct request_queue *q = md->queue; | ||
| 721 | unsigned long flags; | ||
| 722 | |||
| 723 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 724 | if (!queue_in_flight(q)) | ||
| 725 | wakeup_waiters = 1; | ||
| 726 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 727 | |||
| 728 | /* nudge anyone waiting on suspend queue */ | ||
| 729 | if (wakeup_waiters) | ||
| 730 | wake_up(&md->wait); | ||
| 731 | |||
| 732 | if (run_queue) | ||
| 733 | blk_run_queue(q); | ||
| 734 | |||
| 735 | /* | ||
| 736 | * dm_put() must be at the end of this function. See the comment above | ||
| 737 | */ | ||
| 738 | dm_put(md); | ||
| 739 | } | ||
| 740 | |||
| 741 | static void dm_unprep_request(struct request *rq) | ||
| 742 | { | ||
| 743 | struct request *clone = rq->special; | ||
| 744 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 745 | |||
| 746 | rq->special = NULL; | ||
| 747 | rq->cmd_flags &= ~REQ_DONTPREP; | ||
| 748 | |||
| 749 | blk_rq_unprep_clone(clone); | ||
| 750 | free_rq_tio(tio); | ||
| 751 | } | ||
| 752 | |||
| 753 | /* | ||
| 754 | * Requeue the original request of a clone. | ||
| 755 | */ | ||
| 756 | void dm_requeue_unmapped_request(struct request *clone) | ||
| 757 | { | ||
| 758 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 759 | struct mapped_device *md = tio->md; | ||
| 760 | struct request *rq = tio->orig; | ||
| 761 | struct request_queue *q = rq->q; | ||
| 762 | unsigned long flags; | ||
| 763 | |||
| 764 | dm_unprep_request(rq); | ||
| 765 | |||
| 766 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 767 | if (elv_queue_empty(q)) | ||
| 768 | blk_plug_device(q); | ||
| 769 | blk_requeue_request(q, rq); | ||
| 770 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 771 | |||
| 772 | rq_completed(md, 0); | ||
| 773 | } | ||
| 774 | EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); | ||
| 775 | |||
| 776 | static void __stop_queue(struct request_queue *q) | ||
| 777 | { | ||
| 778 | blk_stop_queue(q); | ||
| 779 | } | ||
| 780 | |||
| 781 | static void stop_queue(struct request_queue *q) | ||
| 782 | { | ||
| 783 | unsigned long flags; | ||
| 784 | |||
| 785 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 786 | __stop_queue(q); | ||
| 787 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 788 | } | ||
| 789 | |||
| 790 | static void __start_queue(struct request_queue *q) | ||
| 791 | { | ||
| 792 | if (blk_queue_stopped(q)) | ||
| 793 | blk_start_queue(q); | ||
| 794 | } | ||
| 795 | |||
| 796 | static void start_queue(struct request_queue *q) | ||
| 797 | { | ||
| 798 | unsigned long flags; | ||
| 799 | |||
| 800 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 801 | __start_queue(q); | ||
| 802 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 803 | } | ||
| 804 | |||
| 805 | /* | ||
| 806 | * Complete the clone and the original request. | ||
| 807 | * Must be called without queue lock. | ||
| 808 | */ | ||
| 809 | static void dm_end_request(struct request *clone, int error) | ||
| 810 | { | ||
| 811 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 812 | struct mapped_device *md = tio->md; | ||
| 813 | struct request *rq = tio->orig; | ||
| 814 | |||
| 815 | if (blk_pc_request(rq)) { | ||
| 816 | rq->errors = clone->errors; | ||
| 817 | rq->resid_len = clone->resid_len; | ||
| 818 | |||
| 819 | if (rq->sense) | ||
| 820 | /* | ||
| 821 | * We are using the sense buffer of the original | ||
| 822 | * request. | ||
| 823 | * So setting the length of the sense data is enough. | ||
| 824 | */ | ||
| 825 | rq->sense_len = clone->sense_len; | ||
| 826 | } | ||
| 827 | |||
| 828 | BUG_ON(clone->bio); | ||
| 829 | free_rq_tio(tio); | ||
| 830 | |||
| 831 | blk_end_request_all(rq, error); | ||
| 832 | |||
| 833 | rq_completed(md, 1); | ||
| 834 | } | ||
| 835 | |||
| 836 | /* | ||
| 837 | * Request completion handler for request-based dm | ||
| 838 | */ | ||
| 839 | static void dm_softirq_done(struct request *rq) | ||
| 840 | { | ||
| 841 | struct request *clone = rq->completion_data; | ||
| 842 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 843 | dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; | ||
| 844 | int error = tio->error; | ||
| 845 | |||
| 846 | if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) | ||
| 847 | error = rq_end_io(tio->ti, clone, error, &tio->info); | ||
| 848 | |||
| 849 | if (error <= 0) | ||
| 850 | /* The target wants to complete the I/O */ | ||
| 851 | dm_end_request(clone, error); | ||
| 852 | else if (error == DM_ENDIO_INCOMPLETE) | ||
| 853 | /* The target will handle the I/O */ | ||
| 854 | return; | ||
| 855 | else if (error == DM_ENDIO_REQUEUE) | ||
| 856 | /* The target wants to requeue the I/O */ | ||
| 857 | dm_requeue_unmapped_request(clone); | ||
| 858 | else { | ||
| 859 | DMWARN("unimplemented target endio return value: %d", error); | ||
| 860 | BUG(); | ||
| 861 | } | ||
| 862 | } | ||
| 863 | |||
| 864 | /* | ||
| 865 | * Complete the clone and the original request with the error status | ||
| 866 | * through softirq context. | ||
| 867 | */ | ||
| 868 | static void dm_complete_request(struct request *clone, int error) | ||
| 869 | { | ||
| 870 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 871 | struct request *rq = tio->orig; | ||
| 872 | |||
| 873 | tio->error = error; | ||
| 874 | rq->completion_data = clone; | ||
| 875 | blk_complete_request(rq); | ||
| 876 | } | ||
| 877 | |||
| 878 | /* | ||
| 879 | * Complete the not-mapped clone and the original request with the error status | ||
| 880 | * through softirq context. | ||
| 881 | * Target's rq_end_io() function isn't called. | ||
| 882 | * This may be used when the target's map_rq() function fails. | ||
| 883 | */ | ||
| 884 | void dm_kill_unmapped_request(struct request *clone, int error) | ||
| 885 | { | ||
| 886 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 887 | struct request *rq = tio->orig; | ||
| 888 | |||
| 889 | rq->cmd_flags |= REQ_FAILED; | ||
| 890 | dm_complete_request(clone, error); | ||
| 891 | } | ||
| 892 | EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); | ||
| 893 | |||
| 894 | /* | ||
| 895 | * Called with the queue lock held | ||
| 896 | */ | ||
| 897 | static void end_clone_request(struct request *clone, int error) | ||
| 898 | { | ||
| 899 | /* | ||
| 900 | * For just cleaning up the information of the queue in which | ||
| 901 | * the clone was dispatched. | ||
| 902 | * The clone is *NOT* freed actually here because it is alloced from | ||
| 903 | * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. | ||
| 904 | */ | ||
| 905 | __blk_put_request(clone->q, clone); | ||
| 906 | |||
| 907 | /* | ||
| 908 | * Actual request completion is done in a softirq context which doesn't | ||
| 909 | * hold the queue lock. Otherwise, deadlock could occur because: | ||
| 910 | * - another request may be submitted by the upper level driver | ||
| 911 | * of the stacking during the completion | ||
| 912 | * - the submission which requires queue lock may be done | ||
| 913 | * against this queue | ||
| 914 | */ | ||
| 915 | dm_complete_request(clone, error); | ||
| 916 | } | ||
| 917 | |||
| 612 | static sector_t max_io_len(struct mapped_device *md, | 918 | static sector_t max_io_len(struct mapped_device *md, |
| 613 | sector_t sector, struct dm_target *ti) | 919 | sector_t sector, struct dm_target *ti) |
| 614 | { | 920 | { |
| @@ -636,11 +942,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
| 636 | sector_t sector; | 942 | sector_t sector; |
| 637 | struct mapped_device *md; | 943 | struct mapped_device *md; |
| 638 | 944 | ||
| 639 | /* | ||
| 640 | * Sanity checks. | ||
| 641 | */ | ||
| 642 | BUG_ON(!clone->bi_size); | ||
| 643 | |||
| 644 | clone->bi_end_io = clone_endio; | 945 | clone->bi_end_io = clone_endio; |
| 645 | clone->bi_private = tio; | 946 | clone->bi_private = tio; |
| 646 | 947 | ||
| @@ -656,8 +957,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
| 656 | /* the bio has been remapped so dispatch it */ | 957 | /* the bio has been remapped so dispatch it */ |
| 657 | 958 | ||
| 658 | trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, | 959 | trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, |
| 659 | tio->io->bio->bi_bdev->bd_dev, | 960 | tio->io->bio->bi_bdev->bd_dev, sector); |
| 660 | clone->bi_sector, sector); | ||
| 661 | 961 | ||
| 662 | generic_make_request(clone); | 962 | generic_make_request(clone); |
| 663 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { | 963 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { |
| @@ -755,6 +1055,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector, | |||
| 755 | return clone; | 1055 | return clone; |
| 756 | } | 1056 | } |
| 757 | 1057 | ||
| 1058 | static struct dm_target_io *alloc_tio(struct clone_info *ci, | ||
| 1059 | struct dm_target *ti) | ||
| 1060 | { | ||
| 1061 | struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); | ||
| 1062 | |||
| 1063 | tio->io = ci->io; | ||
| 1064 | tio->ti = ti; | ||
| 1065 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 1066 | |||
| 1067 | return tio; | ||
| 1068 | } | ||
| 1069 | |||
| 1070 | static void __flush_target(struct clone_info *ci, struct dm_target *ti, | ||
| 1071 | unsigned flush_nr) | ||
| 1072 | { | ||
| 1073 | struct dm_target_io *tio = alloc_tio(ci, ti); | ||
| 1074 | struct bio *clone; | ||
| 1075 | |||
| 1076 | tio->info.flush_request = flush_nr; | ||
| 1077 | |||
| 1078 | clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); | ||
| 1079 | __bio_clone(clone, ci->bio); | ||
| 1080 | clone->bi_destructor = dm_bio_destructor; | ||
| 1081 | |||
| 1082 | __map_bio(ti, clone, tio); | ||
| 1083 | } | ||
| 1084 | |||
| 1085 | static int __clone_and_map_empty_barrier(struct clone_info *ci) | ||
| 1086 | { | ||
| 1087 | unsigned target_nr = 0, flush_nr; | ||
| 1088 | struct dm_target *ti; | ||
| 1089 | |||
| 1090 | while ((ti = dm_table_get_target(ci->map, target_nr++))) | ||
| 1091 | for (flush_nr = 0; flush_nr < ti->num_flush_requests; | ||
| 1092 | flush_nr++) | ||
| 1093 | __flush_target(ci, ti, flush_nr); | ||
| 1094 | |||
| 1095 | ci->sector_count = 0; | ||
| 1096 | |||
| 1097 | return 0; | ||
| 1098 | } | ||
| 1099 | |||
| 758 | static int __clone_and_map(struct clone_info *ci) | 1100 | static int __clone_and_map(struct clone_info *ci) |
| 759 | { | 1101 | { |
| 760 | struct bio *clone, *bio = ci->bio; | 1102 | struct bio *clone, *bio = ci->bio; |
| @@ -762,6 +1104,9 @@ static int __clone_and_map(struct clone_info *ci) | |||
| 762 | sector_t len = 0, max; | 1104 | sector_t len = 0, max; |
| 763 | struct dm_target_io *tio; | 1105 | struct dm_target_io *tio; |
| 764 | 1106 | ||
| 1107 | if (unlikely(bio_empty_barrier(bio))) | ||
| 1108 | return __clone_and_map_empty_barrier(ci); | ||
| 1109 | |||
| 765 | ti = dm_table_find_target(ci->map, ci->sector); | 1110 | ti = dm_table_find_target(ci->map, ci->sector); |
| 766 | if (!dm_target_is_valid(ti)) | 1111 | if (!dm_target_is_valid(ti)) |
| 767 | return -EIO; | 1112 | return -EIO; |
| @@ -771,10 +1116,7 @@ static int __clone_and_map(struct clone_info *ci) | |||
| 771 | /* | 1116 | /* |
| 772 | * Allocate a target io object. | 1117 | * Allocate a target io object. |
| 773 | */ | 1118 | */ |
| 774 | tio = alloc_tio(ci->md); | 1119 | tio = alloc_tio(ci, ti); |
| 775 | tio->io = ci->io; | ||
| 776 | tio->ti = ti; | ||
| 777 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 778 | 1120 | ||
| 779 | if (ci->sector_count <= max) { | 1121 | if (ci->sector_count <= max) { |
| 780 | /* | 1122 | /* |
| @@ -830,10 +1172,7 @@ static int __clone_and_map(struct clone_info *ci) | |||
| 830 | 1172 | ||
| 831 | max = max_io_len(ci->md, ci->sector, ti); | 1173 | max = max_io_len(ci->md, ci->sector, ti); |
| 832 | 1174 | ||
| 833 | tio = alloc_tio(ci->md); | 1175 | tio = alloc_tio(ci, ti); |
| 834 | tio->io = ci->io; | ||
| 835 | tio->ti = ti; | ||
| 836 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 837 | } | 1176 | } |
| 838 | 1177 | ||
| 839 | len = min(remaining, max); | 1178 | len = min(remaining, max); |
| @@ -868,7 +1207,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
| 868 | if (!bio_barrier(bio)) | 1207 | if (!bio_barrier(bio)) |
| 869 | bio_io_error(bio); | 1208 | bio_io_error(bio); |
| 870 | else | 1209 | else |
| 871 | md->barrier_error = -EIO; | 1210 | if (!md->barrier_error) |
| 1211 | md->barrier_error = -EIO; | ||
| 872 | return; | 1212 | return; |
| 873 | } | 1213 | } |
| 874 | 1214 | ||
| @@ -881,6 +1221,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) | |||
| 881 | ci.io->md = md; | 1221 | ci.io->md = md; |
| 882 | ci.sector = bio->bi_sector; | 1222 | ci.sector = bio->bi_sector; |
| 883 | ci.sector_count = bio_sectors(bio); | 1223 | ci.sector_count = bio_sectors(bio); |
| 1224 | if (unlikely(bio_empty_barrier(bio))) | ||
| 1225 | ci.sector_count = 1; | ||
| 884 | ci.idx = bio->bi_idx; | 1226 | ci.idx = bio->bi_idx; |
| 885 | 1227 | ||
| 886 | start_io_acct(ci.io); | 1228 | start_io_acct(ci.io); |
| @@ -928,6 +1270,16 @@ static int dm_merge_bvec(struct request_queue *q, | |||
| 928 | */ | 1270 | */ |
| 929 | if (max_size && ti->type->merge) | 1271 | if (max_size && ti->type->merge) |
| 930 | max_size = ti->type->merge(ti, bvm, biovec, max_size); | 1272 | max_size = ti->type->merge(ti, bvm, biovec, max_size); |
| 1273 | /* | ||
| 1274 | * If the target doesn't support merge method and some of the devices | ||
| 1275 | * provided their merge_bvec method (we know this by looking at | ||
| 1276 | * queue_max_hw_sectors), then we can't allow bios with multiple vector | ||
| 1277 | * entries. So always set max_size to 0, and the code below allows | ||
| 1278 | * just one page. | ||
| 1279 | */ | ||
| 1280 | else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) | ||
| 1281 | |||
| 1282 | max_size = 0; | ||
| 931 | 1283 | ||
| 932 | out_table: | 1284 | out_table: |
| 933 | dm_table_put(map); | 1285 | dm_table_put(map); |
| @@ -946,7 +1298,7 @@ out: | |||
| 946 | * The request function that just remaps the bio built up by | 1298 | * The request function that just remaps the bio built up by |
| 947 | * dm_merge_bvec. | 1299 | * dm_merge_bvec. |
| 948 | */ | 1300 | */ |
| 949 | static int dm_request(struct request_queue *q, struct bio *bio) | 1301 | static int _dm_request(struct request_queue *q, struct bio *bio) |
| 950 | { | 1302 | { |
| 951 | int rw = bio_data_dir(bio); | 1303 | int rw = bio_data_dir(bio); |
| 952 | struct mapped_device *md = q->queuedata; | 1304 | struct mapped_device *md = q->queuedata; |
| @@ -983,12 +1335,274 @@ static int dm_request(struct request_queue *q, struct bio *bio) | |||
| 983 | return 0; | 1335 | return 0; |
| 984 | } | 1336 | } |
| 985 | 1337 | ||
| 1338 | static int dm_make_request(struct request_queue *q, struct bio *bio) | ||
| 1339 | { | ||
| 1340 | struct mapped_device *md = q->queuedata; | ||
| 1341 | |||
| 1342 | if (unlikely(bio_barrier(bio))) { | ||
| 1343 | bio_endio(bio, -EOPNOTSUPP); | ||
| 1344 | return 0; | ||
| 1345 | } | ||
| 1346 | |||
| 1347 | return md->saved_make_request_fn(q, bio); /* call __make_request() */ | ||
| 1348 | } | ||
| 1349 | |||
| 1350 | static int dm_request_based(struct mapped_device *md) | ||
| 1351 | { | ||
| 1352 | return blk_queue_stackable(md->queue); | ||
| 1353 | } | ||
| 1354 | |||
| 1355 | static int dm_request(struct request_queue *q, struct bio *bio) | ||
| 1356 | { | ||
| 1357 | struct mapped_device *md = q->queuedata; | ||
| 1358 | |||
| 1359 | if (dm_request_based(md)) | ||
| 1360 | return dm_make_request(q, bio); | ||
| 1361 | |||
| 1362 | return _dm_request(q, bio); | ||
| 1363 | } | ||
| 1364 | |||
| 1365 | void dm_dispatch_request(struct request *rq) | ||
| 1366 | { | ||
| 1367 | int r; | ||
| 1368 | |||
| 1369 | if (blk_queue_io_stat(rq->q)) | ||
| 1370 | rq->cmd_flags |= REQ_IO_STAT; | ||
| 1371 | |||
| 1372 | rq->start_time = jiffies; | ||
| 1373 | r = blk_insert_cloned_request(rq->q, rq); | ||
| 1374 | if (r) | ||
| 1375 | dm_complete_request(rq, r); | ||
| 1376 | } | ||
| 1377 | EXPORT_SYMBOL_GPL(dm_dispatch_request); | ||
| 1378 | |||
| 1379 | static void dm_rq_bio_destructor(struct bio *bio) | ||
| 1380 | { | ||
| 1381 | struct dm_rq_clone_bio_info *info = bio->bi_private; | ||
| 1382 | struct mapped_device *md = info->tio->md; | ||
| 1383 | |||
| 1384 | free_bio_info(info); | ||
| 1385 | bio_free(bio, md->bs); | ||
| 1386 | } | ||
| 1387 | |||
| 1388 | static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, | ||
| 1389 | void *data) | ||
| 1390 | { | ||
| 1391 | struct dm_rq_target_io *tio = data; | ||
| 1392 | struct mapped_device *md = tio->md; | ||
| 1393 | struct dm_rq_clone_bio_info *info = alloc_bio_info(md); | ||
| 1394 | |||
| 1395 | if (!info) | ||
| 1396 | return -ENOMEM; | ||
| 1397 | |||
| 1398 | info->orig = bio_orig; | ||
| 1399 | info->tio = tio; | ||
| 1400 | bio->bi_end_io = end_clone_bio; | ||
| 1401 | bio->bi_private = info; | ||
| 1402 | bio->bi_destructor = dm_rq_bio_destructor; | ||
| 1403 | |||
| 1404 | return 0; | ||
| 1405 | } | ||
| 1406 | |||
| 1407 | static int setup_clone(struct request *clone, struct request *rq, | ||
| 1408 | struct dm_rq_target_io *tio) | ||
| 1409 | { | ||
| 1410 | int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, | ||
| 1411 | dm_rq_bio_constructor, tio); | ||
| 1412 | |||
| 1413 | if (r) | ||
| 1414 | return r; | ||
| 1415 | |||
| 1416 | clone->cmd = rq->cmd; | ||
| 1417 | clone->cmd_len = rq->cmd_len; | ||
| 1418 | clone->sense = rq->sense; | ||
| 1419 | clone->buffer = rq->buffer; | ||
| 1420 | clone->end_io = end_clone_request; | ||
| 1421 | clone->end_io_data = tio; | ||
| 1422 | |||
| 1423 | return 0; | ||
| 1424 | } | ||
| 1425 | |||
| 1426 | static int dm_rq_flush_suspending(struct mapped_device *md) | ||
| 1427 | { | ||
| 1428 | return !md->suspend_rq.special; | ||
| 1429 | } | ||
| 1430 | |||
| 1431 | /* | ||
| 1432 | * Called with the queue lock held. | ||
| 1433 | */ | ||
| 1434 | static int dm_prep_fn(struct request_queue *q, struct request *rq) | ||
| 1435 | { | ||
| 1436 | struct mapped_device *md = q->queuedata; | ||
| 1437 | struct dm_rq_target_io *tio; | ||
| 1438 | struct request *clone; | ||
| 1439 | |||
| 1440 | if (unlikely(rq == &md->suspend_rq)) { | ||
| 1441 | if (dm_rq_flush_suspending(md)) | ||
| 1442 | return BLKPREP_OK; | ||
| 1443 | else | ||
| 1444 | /* The flush suspend was interrupted */ | ||
| 1445 | return BLKPREP_KILL; | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | if (unlikely(rq->special)) { | ||
| 1449 | DMWARN("Already has something in rq->special."); | ||
| 1450 | return BLKPREP_KILL; | ||
| 1451 | } | ||
| 1452 | |||
| 1453 | tio = alloc_rq_tio(md); /* Only one for each original request */ | ||
| 1454 | if (!tio) | ||
| 1455 | /* -ENOMEM */ | ||
| 1456 | return BLKPREP_DEFER; | ||
| 1457 | |||
| 1458 | tio->md = md; | ||
| 1459 | tio->ti = NULL; | ||
| 1460 | tio->orig = rq; | ||
| 1461 | tio->error = 0; | ||
| 1462 | memset(&tio->info, 0, sizeof(tio->info)); | ||
| 1463 | |||
| 1464 | clone = &tio->clone; | ||
| 1465 | if (setup_clone(clone, rq, tio)) { | ||
| 1466 | /* -ENOMEM */ | ||
| 1467 | free_rq_tio(tio); | ||
| 1468 | return BLKPREP_DEFER; | ||
| 1469 | } | ||
| 1470 | |||
| 1471 | rq->special = clone; | ||
| 1472 | rq->cmd_flags |= REQ_DONTPREP; | ||
| 1473 | |||
| 1474 | return BLKPREP_OK; | ||
| 1475 | } | ||
| 1476 | |||
| 1477 | static void map_request(struct dm_target *ti, struct request *rq, | ||
| 1478 | struct mapped_device *md) | ||
| 1479 | { | ||
| 1480 | int r; | ||
| 1481 | struct request *clone = rq->special; | ||
| 1482 | struct dm_rq_target_io *tio = clone->end_io_data; | ||
| 1483 | |||
| 1484 | /* | ||
| 1485 | * Hold the md reference here for the in-flight I/O. | ||
| 1486 | * We can't rely on the reference count by device opener, | ||
| 1487 | * because the device may be closed during the request completion | ||
| 1488 | * when all bios are completed. | ||
| 1489 | * See the comment in rq_completed() too. | ||
| 1490 | */ | ||
| 1491 | dm_get(md); | ||
| 1492 | |||
| 1493 | tio->ti = ti; | ||
| 1494 | r = ti->type->map_rq(ti, clone, &tio->info); | ||
| 1495 | switch (r) { | ||
| 1496 | case DM_MAPIO_SUBMITTED: | ||
| 1497 | /* The target has taken the I/O to submit by itself later */ | ||
| 1498 | break; | ||
| 1499 | case DM_MAPIO_REMAPPED: | ||
| 1500 | /* The target has remapped the I/O so dispatch it */ | ||
| 1501 | dm_dispatch_request(clone); | ||
| 1502 | break; | ||
| 1503 | case DM_MAPIO_REQUEUE: | ||
| 1504 | /* The target wants to requeue the I/O */ | ||
| 1505 | dm_requeue_unmapped_request(clone); | ||
| 1506 | break; | ||
| 1507 | default: | ||
| 1508 | if (r > 0) { | ||
| 1509 | DMWARN("unimplemented target map return value: %d", r); | ||
| 1510 | BUG(); | ||
| 1511 | } | ||
| 1512 | |||
| 1513 | /* The target wants to complete the I/O */ | ||
| 1514 | dm_kill_unmapped_request(clone, r); | ||
| 1515 | break; | ||
| 1516 | } | ||
| 1517 | } | ||
| 1518 | |||
| 1519 | /* | ||
| 1520 | * q->request_fn for request-based dm. | ||
| 1521 | * Called with the queue lock held. | ||
| 1522 | */ | ||
| 1523 | static void dm_request_fn(struct request_queue *q) | ||
| 1524 | { | ||
| 1525 | struct mapped_device *md = q->queuedata; | ||
| 1526 | struct dm_table *map = dm_get_table(md); | ||
| 1527 | struct dm_target *ti; | ||
| 1528 | struct request *rq; | ||
| 1529 | |||
| 1530 | /* | ||
| 1531 | * For noflush suspend, check blk_queue_stopped() to immediately | ||
| 1532 | * quit I/O dispatching. | ||
| 1533 | */ | ||
| 1534 | while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { | ||
| 1535 | rq = blk_peek_request(q); | ||
| 1536 | if (!rq) | ||
| 1537 | goto plug_and_out; | ||
| 1538 | |||
| 1539 | if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */ | ||
| 1540 | if (queue_in_flight(q)) | ||
| 1541 | /* Not quiet yet. Wait more */ | ||
| 1542 | goto plug_and_out; | ||
| 1543 | |||
| 1544 | /* This device should be quiet now */ | ||
| 1545 | __stop_queue(q); | ||
| 1546 | blk_start_request(rq); | ||
| 1547 | __blk_end_request_all(rq, 0); | ||
| 1548 | wake_up(&md->wait); | ||
| 1549 | goto out; | ||
| 1550 | } | ||
| 1551 | |||
| 1552 | ti = dm_table_find_target(map, blk_rq_pos(rq)); | ||
| 1553 | if (ti->type->busy && ti->type->busy(ti)) | ||
| 1554 | goto plug_and_out; | ||
| 1555 | |||
| 1556 | blk_start_request(rq); | ||
| 1557 | spin_unlock(q->queue_lock); | ||
| 1558 | map_request(ti, rq, md); | ||
| 1559 | spin_lock_irq(q->queue_lock); | ||
| 1560 | } | ||
| 1561 | |||
| 1562 | goto out; | ||
| 1563 | |||
| 1564 | plug_and_out: | ||
| 1565 | if (!elv_queue_empty(q)) | ||
| 1566 | /* Some requests still remain, retry later */ | ||
| 1567 | blk_plug_device(q); | ||
| 1568 | |||
| 1569 | out: | ||
| 1570 | dm_table_put(map); | ||
| 1571 | |||
| 1572 | return; | ||
| 1573 | } | ||
| 1574 | |||
| 1575 | int dm_underlying_device_busy(struct request_queue *q) | ||
| 1576 | { | ||
| 1577 | return blk_lld_busy(q); | ||
| 1578 | } | ||
| 1579 | EXPORT_SYMBOL_GPL(dm_underlying_device_busy); | ||
| 1580 | |||
| 1581 | static int dm_lld_busy(struct request_queue *q) | ||
| 1582 | { | ||
| 1583 | int r; | ||
| 1584 | struct mapped_device *md = q->queuedata; | ||
| 1585 | struct dm_table *map = dm_get_table(md); | ||
| 1586 | |||
| 1587 | if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) | ||
| 1588 | r = 1; | ||
| 1589 | else | ||
| 1590 | r = dm_table_any_busy_target(map); | ||
| 1591 | |||
| 1592 | dm_table_put(map); | ||
| 1593 | |||
| 1594 | return r; | ||
| 1595 | } | ||
| 1596 | |||
| 986 | static void dm_unplug_all(struct request_queue *q) | 1597 | static void dm_unplug_all(struct request_queue *q) |
| 987 | { | 1598 | { |
| 988 | struct mapped_device *md = q->queuedata; | 1599 | struct mapped_device *md = q->queuedata; |
| 989 | struct dm_table *map = dm_get_table(md); | 1600 | struct dm_table *map = dm_get_table(md); |
| 990 | 1601 | ||
| 991 | if (map) { | 1602 | if (map) { |
| 1603 | if (dm_request_based(md)) | ||
| 1604 | generic_unplug_device(q); | ||
| 1605 | |||
| 992 | dm_table_unplug_all(map); | 1606 | dm_table_unplug_all(map); |
| 993 | dm_table_put(map); | 1607 | dm_table_put(map); |
| 994 | } | 1608 | } |
| @@ -1003,7 +1617,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits) | |||
| 1003 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { | 1617 | if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { |
| 1004 | map = dm_get_table(md); | 1618 | map = dm_get_table(md); |
| 1005 | if (map) { | 1619 | if (map) { |
| 1006 | r = dm_table_any_congested(map, bdi_bits); | 1620 | /* |
| 1621 | * Request-based dm cares about only own queue for | ||
| 1622 | * the query about congestion status of request_queue | ||
| 1623 | */ | ||
| 1624 | if (dm_request_based(md)) | ||
| 1625 | r = md->queue->backing_dev_info.state & | ||
| 1626 | bdi_bits; | ||
| 1627 | else | ||
| 1628 | r = dm_table_any_congested(map, bdi_bits); | ||
| 1629 | |||
| 1007 | dm_table_put(map); | 1630 | dm_table_put(map); |
| 1008 | } | 1631 | } |
| 1009 | } | 1632 | } |
| @@ -1126,30 +1749,32 @@ static struct mapped_device *alloc_dev(int minor) | |||
| 1126 | INIT_LIST_HEAD(&md->uevent_list); | 1749 | INIT_LIST_HEAD(&md->uevent_list); |
| 1127 | spin_lock_init(&md->uevent_lock); | 1750 | spin_lock_init(&md->uevent_lock); |
| 1128 | 1751 | ||
| 1129 | md->queue = blk_alloc_queue(GFP_KERNEL); | 1752 | md->queue = blk_init_queue(dm_request_fn, NULL); |
| 1130 | if (!md->queue) | 1753 | if (!md->queue) |
| 1131 | goto bad_queue; | 1754 | goto bad_queue; |
| 1132 | 1755 | ||
| 1756 | /* | ||
| 1757 | * Request-based dm devices cannot be stacked on top of bio-based dm | ||
| 1758 | * devices. The type of this dm device has not been decided yet, | ||
| 1759 | * although we initialized the queue using blk_init_queue(). | ||
| 1760 | * The type is decided at the first table loading time. | ||
| 1761 | * To prevent problematic device stacking, clear the queue flag | ||
| 1762 | * for request stacking support until then. | ||
| 1763 | * | ||
| 1764 | * This queue is new, so no concurrency on the queue_flags. | ||
| 1765 | */ | ||
| 1766 | queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); | ||
| 1767 | md->saved_make_request_fn = md->queue->make_request_fn; | ||
| 1133 | md->queue->queuedata = md; | 1768 | md->queue->queuedata = md; |
| 1134 | md->queue->backing_dev_info.congested_fn = dm_any_congested; | 1769 | md->queue->backing_dev_info.congested_fn = dm_any_congested; |
| 1135 | md->queue->backing_dev_info.congested_data = md; | 1770 | md->queue->backing_dev_info.congested_data = md; |
| 1136 | blk_queue_make_request(md->queue, dm_request); | 1771 | blk_queue_make_request(md->queue, dm_request); |
| 1137 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
| 1138 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); | 1772 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); |
| 1139 | md->queue->unplug_fn = dm_unplug_all; | 1773 | md->queue->unplug_fn = dm_unplug_all; |
| 1140 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); | 1774 | blk_queue_merge_bvec(md->queue, dm_merge_bvec); |
| 1141 | 1775 | blk_queue_softirq_done(md->queue, dm_softirq_done); | |
| 1142 | md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); | 1776 | blk_queue_prep_rq(md->queue, dm_prep_fn); |
| 1143 | if (!md->io_pool) | 1777 | blk_queue_lld_busy(md->queue, dm_lld_busy); |
| 1144 | goto bad_io_pool; | ||
| 1145 | |||
| 1146 | md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); | ||
| 1147 | if (!md->tio_pool) | ||
| 1148 | goto bad_tio_pool; | ||
| 1149 | |||
| 1150 | md->bs = bioset_create(16, 0); | ||
| 1151 | if (!md->bs) | ||
| 1152 | goto bad_no_bioset; | ||
| 1153 | 1778 | ||
| 1154 | md->disk = alloc_disk(1); | 1779 | md->disk = alloc_disk(1); |
| 1155 | if (!md->disk) | 1780 | if (!md->disk) |
| @@ -1173,6 +1798,10 @@ static struct mapped_device *alloc_dev(int minor) | |||
| 1173 | if (!md->wq) | 1798 | if (!md->wq) |
| 1174 | goto bad_thread; | 1799 | goto bad_thread; |
| 1175 | 1800 | ||
| 1801 | md->bdev = bdget_disk(md->disk, 0); | ||
| 1802 | if (!md->bdev) | ||
| 1803 | goto bad_bdev; | ||
| 1804 | |||
| 1176 | /* Populate the mapping, nobody knows we exist yet */ | 1805 | /* Populate the mapping, nobody knows we exist yet */ |
| 1177 | spin_lock(&_minor_lock); | 1806 | spin_lock(&_minor_lock); |
| 1178 | old_md = idr_replace(&_minor_idr, md, minor); | 1807 | old_md = idr_replace(&_minor_idr, md, minor); |
| @@ -1182,15 +1811,11 @@ static struct mapped_device *alloc_dev(int minor) | |||
| 1182 | 1811 | ||
| 1183 | return md; | 1812 | return md; |
| 1184 | 1813 | ||
| 1814 | bad_bdev: | ||
| 1815 | destroy_workqueue(md->wq); | ||
| 1185 | bad_thread: | 1816 | bad_thread: |
| 1186 | put_disk(md->disk); | 1817 | put_disk(md->disk); |
| 1187 | bad_disk: | 1818 | bad_disk: |
| 1188 | bioset_free(md->bs); | ||
| 1189 | bad_no_bioset: | ||
| 1190 | mempool_destroy(md->tio_pool); | ||
| 1191 | bad_tio_pool: | ||
| 1192 | mempool_destroy(md->io_pool); | ||
| 1193 | bad_io_pool: | ||
| 1194 | blk_cleanup_queue(md->queue); | 1819 | blk_cleanup_queue(md->queue); |
| 1195 | bad_queue: | 1820 | bad_queue: |
| 1196 | free_minor(minor); | 1821 | free_minor(minor); |
| @@ -1207,14 +1832,15 @@ static void free_dev(struct mapped_device *md) | |||
| 1207 | { | 1832 | { |
| 1208 | int minor = MINOR(disk_devt(md->disk)); | 1833 | int minor = MINOR(disk_devt(md->disk)); |
| 1209 | 1834 | ||
| 1210 | if (md->suspended_bdev) { | 1835 | unlock_fs(md); |
| 1211 | unlock_fs(md); | 1836 | bdput(md->bdev); |
| 1212 | bdput(md->suspended_bdev); | ||
| 1213 | } | ||
| 1214 | destroy_workqueue(md->wq); | 1837 | destroy_workqueue(md->wq); |
| 1215 | mempool_destroy(md->tio_pool); | 1838 | if (md->tio_pool) |
| 1216 | mempool_destroy(md->io_pool); | 1839 | mempool_destroy(md->tio_pool); |
| 1217 | bioset_free(md->bs); | 1840 | if (md->io_pool) |
| 1841 | mempool_destroy(md->io_pool); | ||
| 1842 | if (md->bs) | ||
| 1843 | bioset_free(md->bs); | ||
| 1218 | blk_integrity_unregister(md->disk); | 1844 | blk_integrity_unregister(md->disk); |
| 1219 | del_gendisk(md->disk); | 1845 | del_gendisk(md->disk); |
| 1220 | free_minor(minor); | 1846 | free_minor(minor); |
| @@ -1229,6 +1855,29 @@ static void free_dev(struct mapped_device *md) | |||
| 1229 | kfree(md); | 1855 | kfree(md); |
| 1230 | } | 1856 | } |
| 1231 | 1857 | ||
| 1858 | static void __bind_mempools(struct mapped_device *md, struct dm_table *t) | ||
| 1859 | { | ||
| 1860 | struct dm_md_mempools *p; | ||
| 1861 | |||
| 1862 | if (md->io_pool && md->tio_pool && md->bs) | ||
| 1863 | /* the md already has necessary mempools */ | ||
| 1864 | goto out; | ||
| 1865 | |||
| 1866 | p = dm_table_get_md_mempools(t); | ||
| 1867 | BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); | ||
| 1868 | |||
| 1869 | md->io_pool = p->io_pool; | ||
| 1870 | p->io_pool = NULL; | ||
| 1871 | md->tio_pool = p->tio_pool; | ||
| 1872 | p->tio_pool = NULL; | ||
| 1873 | md->bs = p->bs; | ||
| 1874 | p->bs = NULL; | ||
| 1875 | |||
| 1876 | out: | ||
| 1877 | /* mempool bind completed, now no need any mempools in the table */ | ||
| 1878 | dm_table_free_md_mempools(t); | ||
| 1879 | } | ||
| 1880 | |||
| 1232 | /* | 1881 | /* |
| 1233 | * Bind a table to the device. | 1882 | * Bind a table to the device. |
| 1234 | */ | 1883 | */ |
| @@ -1252,15 +1901,17 @@ static void __set_size(struct mapped_device *md, sector_t size) | |||
| 1252 | { | 1901 | { |
| 1253 | set_capacity(md->disk, size); | 1902 | set_capacity(md->disk, size); |
| 1254 | 1903 | ||
| 1255 | mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); | 1904 | mutex_lock(&md->bdev->bd_inode->i_mutex); |
| 1256 | i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | 1905 | i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); |
| 1257 | mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); | 1906 | mutex_unlock(&md->bdev->bd_inode->i_mutex); |
| 1258 | } | 1907 | } |
| 1259 | 1908 | ||
| 1260 | static int __bind(struct mapped_device *md, struct dm_table *t) | 1909 | static int __bind(struct mapped_device *md, struct dm_table *t, |
| 1910 | struct queue_limits *limits) | ||
| 1261 | { | 1911 | { |
| 1262 | struct request_queue *q = md->queue; | 1912 | struct request_queue *q = md->queue; |
| 1263 | sector_t size; | 1913 | sector_t size; |
| 1914 | unsigned long flags; | ||
| 1264 | 1915 | ||
| 1265 | size = dm_table_get_size(t); | 1916 | size = dm_table_get_size(t); |
| 1266 | 1917 | ||
| @@ -1270,8 +1921,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
| 1270 | if (size != get_capacity(md->disk)) | 1921 | if (size != get_capacity(md->disk)) |
| 1271 | memset(&md->geometry, 0, sizeof(md->geometry)); | 1922 | memset(&md->geometry, 0, sizeof(md->geometry)); |
| 1272 | 1923 | ||
| 1273 | if (md->suspended_bdev) | 1924 | __set_size(md, size); |
| 1274 | __set_size(md, size); | ||
| 1275 | 1925 | ||
| 1276 | if (!size) { | 1926 | if (!size) { |
| 1277 | dm_table_destroy(t); | 1927 | dm_table_destroy(t); |
| @@ -1280,10 +1930,22 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
| 1280 | 1930 | ||
| 1281 | dm_table_event_callback(t, event_callback, md); | 1931 | dm_table_event_callback(t, event_callback, md); |
| 1282 | 1932 | ||
| 1283 | write_lock(&md->map_lock); | 1933 | /* |
| 1934 | * The queue hasn't been stopped yet, if the old table type wasn't | ||
| 1935 | * for request-based during suspension. So stop it to prevent | ||
| 1936 | * I/O mapping before resume. | ||
| 1937 | * This must be done before setting the queue restrictions, | ||
| 1938 | * because request-based dm may be run just after the setting. | ||
| 1939 | */ | ||
| 1940 | if (dm_table_request_based(t) && !blk_queue_stopped(q)) | ||
| 1941 | stop_queue(q); | ||
| 1942 | |||
| 1943 | __bind_mempools(md, t); | ||
| 1944 | |||
| 1945 | write_lock_irqsave(&md->map_lock, flags); | ||
| 1284 | md->map = t; | 1946 | md->map = t; |
| 1285 | dm_table_set_restrictions(t, q); | 1947 | dm_table_set_restrictions(t, q, limits); |
| 1286 | write_unlock(&md->map_lock); | 1948 | write_unlock_irqrestore(&md->map_lock, flags); |
| 1287 | 1949 | ||
| 1288 | return 0; | 1950 | return 0; |
| 1289 | } | 1951 | } |
| @@ -1291,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t) | |||
| 1291 | static void __unbind(struct mapped_device *md) | 1953 | static void __unbind(struct mapped_device *md) |
| 1292 | { | 1954 | { |
| 1293 | struct dm_table *map = md->map; | 1955 | struct dm_table *map = md->map; |
| 1956 | unsigned long flags; | ||
| 1294 | 1957 | ||
| 1295 | if (!map) | 1958 | if (!map) |
| 1296 | return; | 1959 | return; |
| 1297 | 1960 | ||
| 1298 | dm_table_event_callback(map, NULL, NULL); | 1961 | dm_table_event_callback(map, NULL, NULL); |
| 1299 | write_lock(&md->map_lock); | 1962 | write_lock_irqsave(&md->map_lock, flags); |
| 1300 | md->map = NULL; | 1963 | md->map = NULL; |
| 1301 | write_unlock(&md->map_lock); | 1964 | write_unlock_irqrestore(&md->map_lock, flags); |
| 1302 | dm_table_destroy(map); | 1965 | dm_table_destroy(map); |
| 1303 | } | 1966 | } |
| 1304 | 1967 | ||
| @@ -1402,6 +2065,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
| 1402 | { | 2065 | { |
| 1403 | int r = 0; | 2066 | int r = 0; |
| 1404 | DECLARE_WAITQUEUE(wait, current); | 2067 | DECLARE_WAITQUEUE(wait, current); |
| 2068 | struct request_queue *q = md->queue; | ||
| 2069 | unsigned long flags; | ||
| 1405 | 2070 | ||
| 1406 | dm_unplug_all(md->queue); | 2071 | dm_unplug_all(md->queue); |
| 1407 | 2072 | ||
| @@ -1411,7 +2076,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
| 1411 | set_current_state(interruptible); | 2076 | set_current_state(interruptible); |
| 1412 | 2077 | ||
| 1413 | smp_mb(); | 2078 | smp_mb(); |
| 1414 | if (!atomic_read(&md->pending)) | 2079 | if (dm_request_based(md)) { |
| 2080 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 2081 | if (!queue_in_flight(q) && blk_queue_stopped(q)) { | ||
| 2082 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2083 | break; | ||
| 2084 | } | ||
| 2085 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2086 | } else if (!atomic_read(&md->pending)) | ||
| 1415 | break; | 2087 | break; |
| 1416 | 2088 | ||
| 1417 | if (interruptible == TASK_INTERRUPTIBLE && | 2089 | if (interruptible == TASK_INTERRUPTIBLE && |
| @@ -1429,34 +2101,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible) | |||
| 1429 | return r; | 2101 | return r; |
| 1430 | } | 2102 | } |
| 1431 | 2103 | ||
| 1432 | static int dm_flush(struct mapped_device *md) | 2104 | static void dm_flush(struct mapped_device *md) |
| 1433 | { | 2105 | { |
| 1434 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | 2106 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); |
| 1435 | return 0; | 2107 | |
| 2108 | bio_init(&md->barrier_bio); | ||
| 2109 | md->barrier_bio.bi_bdev = md->bdev; | ||
| 2110 | md->barrier_bio.bi_rw = WRITE_BARRIER; | ||
| 2111 | __split_and_process_bio(md, &md->barrier_bio); | ||
| 2112 | |||
| 2113 | dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); | ||
| 1436 | } | 2114 | } |
| 1437 | 2115 | ||
| 1438 | static void process_barrier(struct mapped_device *md, struct bio *bio) | 2116 | static void process_barrier(struct mapped_device *md, struct bio *bio) |
| 1439 | { | 2117 | { |
| 1440 | int error = dm_flush(md); | 2118 | md->barrier_error = 0; |
| 1441 | 2119 | ||
| 1442 | if (unlikely(error)) { | 2120 | dm_flush(md); |
| 1443 | bio_endio(bio, error); | ||
| 1444 | return; | ||
| 1445 | } | ||
| 1446 | if (bio_empty_barrier(bio)) { | ||
| 1447 | bio_endio(bio, 0); | ||
| 1448 | return; | ||
| 1449 | } | ||
| 1450 | |||
| 1451 | __split_and_process_bio(md, bio); | ||
| 1452 | 2121 | ||
| 1453 | error = dm_flush(md); | 2122 | if (!bio_empty_barrier(bio)) { |
| 1454 | 2123 | __split_and_process_bio(md, bio); | |
| 1455 | if (!error && md->barrier_error) | 2124 | dm_flush(md); |
| 1456 | error = md->barrier_error; | 2125 | } |
| 1457 | 2126 | ||
| 1458 | if (md->barrier_error != DM_ENDIO_REQUEUE) | 2127 | if (md->barrier_error != DM_ENDIO_REQUEUE) |
| 1459 | bio_endio(bio, error); | 2128 | bio_endio(bio, md->barrier_error); |
| 2129 | else { | ||
| 2130 | spin_lock_irq(&md->deferred_lock); | ||
| 2131 | bio_list_add_head(&md->deferred, bio); | ||
| 2132 | spin_unlock_irq(&md->deferred_lock); | ||
| 2133 | } | ||
| 1460 | } | 2134 | } |
| 1461 | 2135 | ||
| 1462 | /* | 2136 | /* |
| @@ -1482,10 +2156,14 @@ static void dm_wq_work(struct work_struct *work) | |||
| 1482 | 2156 | ||
| 1483 | up_write(&md->io_lock); | 2157 | up_write(&md->io_lock); |
| 1484 | 2158 | ||
| 1485 | if (bio_barrier(c)) | 2159 | if (dm_request_based(md)) |
| 1486 | process_barrier(md, c); | 2160 | generic_make_request(c); |
| 1487 | else | 2161 | else { |
| 1488 | __split_and_process_bio(md, c); | 2162 | if (bio_barrier(c)) |
| 2163 | process_barrier(md, c); | ||
| 2164 | else | ||
| 2165 | __split_and_process_bio(md, c); | ||
| 2166 | } | ||
| 1489 | 2167 | ||
| 1490 | down_write(&md->io_lock); | 2168 | down_write(&md->io_lock); |
| 1491 | } | 2169 | } |
| @@ -1505,6 +2183,7 @@ static void dm_queue_flush(struct mapped_device *md) | |||
| 1505 | */ | 2183 | */ |
| 1506 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | 2184 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) |
| 1507 | { | 2185 | { |
| 2186 | struct queue_limits limits; | ||
| 1508 | int r = -EINVAL; | 2187 | int r = -EINVAL; |
| 1509 | 2188 | ||
| 1510 | mutex_lock(&md->suspend_lock); | 2189 | mutex_lock(&md->suspend_lock); |
| @@ -1513,19 +2192,96 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table) | |||
| 1513 | if (!dm_suspended(md)) | 2192 | if (!dm_suspended(md)) |
| 1514 | goto out; | 2193 | goto out; |
| 1515 | 2194 | ||
| 1516 | /* without bdev, the device size cannot be changed */ | 2195 | r = dm_calculate_queue_limits(table, &limits); |
| 1517 | if (!md->suspended_bdev) | 2196 | if (r) |
| 1518 | if (get_capacity(md->disk) != dm_table_get_size(table)) | 2197 | goto out; |
| 1519 | goto out; | 2198 | |
| 2199 | /* cannot change the device type, once a table is bound */ | ||
| 2200 | if (md->map && | ||
| 2201 | (dm_table_get_type(md->map) != dm_table_get_type(table))) { | ||
| 2202 | DMWARN("can't change the device type after a table is bound"); | ||
| 2203 | goto out; | ||
| 2204 | } | ||
| 2205 | |||
| 2206 | /* | ||
| 2207 | * It is enought that blk_queue_ordered() is called only once when | ||
| 2208 | * the first bio-based table is bound. | ||
| 2209 | * | ||
| 2210 | * This setting should be moved to alloc_dev() when request-based dm | ||
| 2211 | * supports barrier. | ||
| 2212 | */ | ||
| 2213 | if (!md->map && dm_table_bio_based(table)) | ||
| 2214 | blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL); | ||
| 1520 | 2215 | ||
| 1521 | __unbind(md); | 2216 | __unbind(md); |
| 1522 | r = __bind(md, table); | 2217 | r = __bind(md, table, &limits); |
| 1523 | 2218 | ||
| 1524 | out: | 2219 | out: |
| 1525 | mutex_unlock(&md->suspend_lock); | 2220 | mutex_unlock(&md->suspend_lock); |
| 1526 | return r; | 2221 | return r; |
| 1527 | } | 2222 | } |
| 1528 | 2223 | ||
| 2224 | static void dm_rq_invalidate_suspend_marker(struct mapped_device *md) | ||
| 2225 | { | ||
| 2226 | md->suspend_rq.special = (void *)0x1; | ||
| 2227 | } | ||
| 2228 | |||
| 2229 | static void dm_rq_abort_suspend(struct mapped_device *md, int noflush) | ||
| 2230 | { | ||
| 2231 | struct request_queue *q = md->queue; | ||
| 2232 | unsigned long flags; | ||
| 2233 | |||
| 2234 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 2235 | if (!noflush) | ||
| 2236 | dm_rq_invalidate_suspend_marker(md); | ||
| 2237 | __start_queue(q); | ||
| 2238 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2239 | } | ||
| 2240 | |||
| 2241 | static void dm_rq_start_suspend(struct mapped_device *md, int noflush) | ||
| 2242 | { | ||
| 2243 | struct request *rq = &md->suspend_rq; | ||
| 2244 | struct request_queue *q = md->queue; | ||
| 2245 | |||
| 2246 | if (noflush) | ||
| 2247 | stop_queue(q); | ||
| 2248 | else { | ||
| 2249 | blk_rq_init(q, rq); | ||
| 2250 | blk_insert_request(q, rq, 0, NULL); | ||
| 2251 | } | ||
| 2252 | } | ||
| 2253 | |||
| 2254 | static int dm_rq_suspend_available(struct mapped_device *md, int noflush) | ||
| 2255 | { | ||
| 2256 | int r = 1; | ||
| 2257 | struct request *rq = &md->suspend_rq; | ||
| 2258 | struct request_queue *q = md->queue; | ||
| 2259 | unsigned long flags; | ||
| 2260 | |||
| 2261 | if (noflush) | ||
| 2262 | return r; | ||
| 2263 | |||
| 2264 | /* The marker must be protected by queue lock if it is in use */ | ||
| 2265 | spin_lock_irqsave(q->queue_lock, flags); | ||
| 2266 | if (unlikely(rq->ref_count)) { | ||
| 2267 | /* | ||
| 2268 | * This can happen, when the previous flush suspend was | ||
| 2269 | * interrupted, the marker is still in the queue and | ||
| 2270 | * this flush suspend has been invoked, because we don't | ||
| 2271 | * remove the marker at the time of suspend interruption. | ||
| 2272 | * We have only one marker per mapped_device, so we can't | ||
| 2273 | * start another flush suspend while it is in use. | ||
| 2274 | */ | ||
| 2275 | BUG_ON(!rq->special); /* The marker should be invalidated */ | ||
| 2276 | DMWARN("Invalidating the previous flush suspend is still in" | ||
| 2277 | " progress. Please retry later."); | ||
| 2278 | r = 0; | ||
| 2279 | } | ||
| 2280 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
| 2281 | |||
| 2282 | return r; | ||
| 2283 | } | ||
| 2284 | |||
| 1529 | /* | 2285 | /* |
| 1530 | * Functions to lock and unlock any filesystem running on the | 2286 | * Functions to lock and unlock any filesystem running on the |
| 1531 | * device. | 2287 | * device. |
| @@ -1536,7 +2292,7 @@ static int lock_fs(struct mapped_device *md) | |||
| 1536 | 2292 | ||
| 1537 | WARN_ON(md->frozen_sb); | 2293 | WARN_ON(md->frozen_sb); |
| 1538 | 2294 | ||
| 1539 | md->frozen_sb = freeze_bdev(md->suspended_bdev); | 2295 | md->frozen_sb = freeze_bdev(md->bdev); |
| 1540 | if (IS_ERR(md->frozen_sb)) { | 2296 | if (IS_ERR(md->frozen_sb)) { |
| 1541 | r = PTR_ERR(md->frozen_sb); | 2297 | r = PTR_ERR(md->frozen_sb); |
| 1542 | md->frozen_sb = NULL; | 2298 | md->frozen_sb = NULL; |
| @@ -1545,9 +2301,6 @@ static int lock_fs(struct mapped_device *md) | |||
| 1545 | 2301 | ||
| 1546 | set_bit(DMF_FROZEN, &md->flags); | 2302 | set_bit(DMF_FROZEN, &md->flags); |
| 1547 | 2303 | ||
| 1548 | /* don't bdput right now, we don't want the bdev | ||
| 1549 | * to go away while it is locked. | ||
| 1550 | */ | ||
| 1551 | return 0; | 2304 | return 0; |
| 1552 | } | 2305 | } |
| 1553 | 2306 | ||
| @@ -1556,7 +2309,7 @@ static void unlock_fs(struct mapped_device *md) | |||
| 1556 | if (!test_bit(DMF_FROZEN, &md->flags)) | 2309 | if (!test_bit(DMF_FROZEN, &md->flags)) |
| 1557 | return; | 2310 | return; |
| 1558 | 2311 | ||
| 1559 | thaw_bdev(md->suspended_bdev, md->frozen_sb); | 2312 | thaw_bdev(md->bdev, md->frozen_sb); |
| 1560 | md->frozen_sb = NULL; | 2313 | md->frozen_sb = NULL; |
| 1561 | clear_bit(DMF_FROZEN, &md->flags); | 2314 | clear_bit(DMF_FROZEN, &md->flags); |
| 1562 | } | 2315 | } |
| @@ -1568,6 +2321,53 @@ static void unlock_fs(struct mapped_device *md) | |||
| 1568 | * dm_bind_table, dm_suspend must be called to flush any in | 2321 | * dm_bind_table, dm_suspend must be called to flush any in |
| 1569 | * flight bios and ensure that any further io gets deferred. | 2322 | * flight bios and ensure that any further io gets deferred. |
| 1570 | */ | 2323 | */ |
| 2324 | /* | ||
| 2325 | * Suspend mechanism in request-based dm. | ||
| 2326 | * | ||
| 2327 | * After the suspend starts, further incoming requests are kept in | ||
| 2328 | * the request_queue and deferred. | ||
| 2329 | * Remaining requests in the request_queue at the start of suspend are flushed | ||
| 2330 | * if it is flush suspend. | ||
| 2331 | * The suspend completes when the following conditions have been satisfied, | ||
| 2332 | * so wait for it: | ||
| 2333 | * 1. q->in_flight is 0 (which means no in_flight request) | ||
| 2334 | * 2. queue has been stopped (which means no request dispatching) | ||
| 2335 | * | ||
| 2336 | * | ||
| 2337 | * Noflush suspend | ||
| 2338 | * --------------- | ||
| 2339 | * Noflush suspend doesn't need to dispatch remaining requests. | ||
| 2340 | * So stop the queue immediately. Then, wait for all in_flight requests | ||
| 2341 | * to be completed or requeued. | ||
| 2342 | * | ||
| 2343 | * To abort noflush suspend, start the queue. | ||
| 2344 | * | ||
| 2345 | * | ||
| 2346 | * Flush suspend | ||
| 2347 | * ------------- | ||
| 2348 | * Flush suspend needs to dispatch remaining requests. So stop the queue | ||
| 2349 | * after the remaining requests are completed. (Requeued request must be also | ||
| 2350 | * re-dispatched and completed. Until then, we can't stop the queue.) | ||
| 2351 | * | ||
| 2352 | * During flushing the remaining requests, further incoming requests are also | ||
| 2353 | * inserted to the same queue. To distinguish which requests are to be | ||
| 2354 | * flushed, we insert a marker request to the queue at the time of starting | ||
| 2355 | * flush suspend, like a barrier. | ||
| 2356 | * The dispatching is blocked when the marker is found on the top of the queue. | ||
| 2357 | * And the queue is stopped when all in_flight requests are completed, since | ||
| 2358 | * that means the remaining requests are completely flushed. | ||
| 2359 | * Then, the marker is removed from the queue. | ||
| 2360 | * | ||
| 2361 | * To abort flush suspend, we also need to take care of the marker, not only | ||
| 2362 | * starting the queue. | ||
| 2363 | * We don't remove the marker forcibly from the queue since it's against | ||
| 2364 | * the block-layer manner. Instead, we put a invalidated mark on the marker. | ||
| 2365 | * When the invalidated marker is found on the top of the queue, it is | ||
| 2366 | * immediately removed from the queue, so it doesn't block dispatching. | ||
| 2367 | * Because we have only one marker per mapped_device, we can't start another | ||
| 2368 | * flush suspend until the invalidated marker is removed from the queue. | ||
| 2369 | * So fail and return with -EBUSY in such a case. | ||
| 2370 | */ | ||
| 1571 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | 2371 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) |
| 1572 | { | 2372 | { |
| 1573 | struct dm_table *map = NULL; | 2373 | struct dm_table *map = NULL; |
| @@ -1582,6 +2382,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1582 | goto out_unlock; | 2382 | goto out_unlock; |
| 1583 | } | 2383 | } |
| 1584 | 2384 | ||
| 2385 | if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) { | ||
| 2386 | r = -EBUSY; | ||
| 2387 | goto out_unlock; | ||
| 2388 | } | ||
| 2389 | |||
| 1585 | map = dm_get_table(md); | 2390 | map = dm_get_table(md); |
| 1586 | 2391 | ||
| 1587 | /* | 2392 | /* |
| @@ -1594,24 +2399,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1594 | /* This does not get reverted if there's an error later. */ | 2399 | /* This does not get reverted if there's an error later. */ |
| 1595 | dm_table_presuspend_targets(map); | 2400 | dm_table_presuspend_targets(map); |
| 1596 | 2401 | ||
| 1597 | /* bdget() can stall if the pending I/Os are not flushed */ | 2402 | /* |
| 1598 | if (!noflush) { | 2403 | * Flush I/O to the device. noflush supersedes do_lockfs, |
| 1599 | md->suspended_bdev = bdget_disk(md->disk, 0); | 2404 | * because lock_fs() needs to flush I/Os. |
| 1600 | if (!md->suspended_bdev) { | 2405 | */ |
| 1601 | DMWARN("bdget failed in dm_suspend"); | 2406 | if (!noflush && do_lockfs) { |
| 1602 | r = -ENOMEM; | 2407 | r = lock_fs(md); |
| 2408 | if (r) | ||
| 1603 | goto out; | 2409 | goto out; |
| 1604 | } | ||
| 1605 | |||
| 1606 | /* | ||
| 1607 | * Flush I/O to the device. noflush supersedes do_lockfs, | ||
| 1608 | * because lock_fs() needs to flush I/Os. | ||
| 1609 | */ | ||
| 1610 | if (do_lockfs) { | ||
| 1611 | r = lock_fs(md); | ||
| 1612 | if (r) | ||
| 1613 | goto out; | ||
| 1614 | } | ||
| 1615 | } | 2410 | } |
| 1616 | 2411 | ||
| 1617 | /* | 2412 | /* |
| @@ -1637,6 +2432,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1637 | 2432 | ||
| 1638 | flush_workqueue(md->wq); | 2433 | flush_workqueue(md->wq); |
| 1639 | 2434 | ||
| 2435 | if (dm_request_based(md)) | ||
| 2436 | dm_rq_start_suspend(md, noflush); | ||
| 2437 | |||
| 1640 | /* | 2438 | /* |
| 1641 | * At this point no more requests are entering target request routines. | 2439 | * At this point no more requests are entering target request routines. |
| 1642 | * We call dm_wait_for_completion to wait for all existing requests | 2440 | * We call dm_wait_for_completion to wait for all existing requests |
| @@ -1653,6 +2451,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1653 | if (r < 0) { | 2451 | if (r < 0) { |
| 1654 | dm_queue_flush(md); | 2452 | dm_queue_flush(md); |
| 1655 | 2453 | ||
| 2454 | if (dm_request_based(md)) | ||
| 2455 | dm_rq_abort_suspend(md, noflush); | ||
| 2456 | |||
| 1656 | unlock_fs(md); | 2457 | unlock_fs(md); |
| 1657 | goto out; /* pushback list is already flushed, so skip flush */ | 2458 | goto out; /* pushback list is already flushed, so skip flush */ |
| 1658 | } | 2459 | } |
| @@ -1668,11 +2469,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | |||
| 1668 | set_bit(DMF_SUSPENDED, &md->flags); | 2469 | set_bit(DMF_SUSPENDED, &md->flags); |
| 1669 | 2470 | ||
| 1670 | out: | 2471 | out: |
| 1671 | if (r && md->suspended_bdev) { | ||
| 1672 | bdput(md->suspended_bdev); | ||
| 1673 | md->suspended_bdev = NULL; | ||
| 1674 | } | ||
| 1675 | |||
| 1676 | dm_table_put(map); | 2472 | dm_table_put(map); |
| 1677 | 2473 | ||
| 1678 | out_unlock: | 2474 | out_unlock: |
| @@ -1699,21 +2495,20 @@ int dm_resume(struct mapped_device *md) | |||
| 1699 | 2495 | ||
| 1700 | dm_queue_flush(md); | 2496 | dm_queue_flush(md); |
| 1701 | 2497 | ||
| 1702 | unlock_fs(md); | 2498 | /* |
| 2499 | * Flushing deferred I/Os must be done after targets are resumed | ||
| 2500 | * so that mapping of targets can work correctly. | ||
| 2501 | * Request-based dm is queueing the deferred I/Os in its request_queue. | ||
| 2502 | */ | ||
| 2503 | if (dm_request_based(md)) | ||
| 2504 | start_queue(md->queue); | ||
| 1703 | 2505 | ||
| 1704 | if (md->suspended_bdev) { | 2506 | unlock_fs(md); |
| 1705 | bdput(md->suspended_bdev); | ||
| 1706 | md->suspended_bdev = NULL; | ||
| 1707 | } | ||
| 1708 | 2507 | ||
| 1709 | clear_bit(DMF_SUSPENDED, &md->flags); | 2508 | clear_bit(DMF_SUSPENDED, &md->flags); |
| 1710 | 2509 | ||
| 1711 | dm_table_unplug_all(map); | 2510 | dm_table_unplug_all(map); |
| 1712 | |||
| 1713 | dm_kobject_uevent(md); | ||
| 1714 | |||
| 1715 | r = 0; | 2511 | r = 0; |
| 1716 | |||
| 1717 | out: | 2512 | out: |
| 1718 | dm_table_put(map); | 2513 | dm_table_put(map); |
| 1719 | mutex_unlock(&md->suspend_lock); | 2514 | mutex_unlock(&md->suspend_lock); |
| @@ -1724,9 +2519,19 @@ out: | |||
| 1724 | /*----------------------------------------------------------------- | 2519 | /*----------------------------------------------------------------- |
| 1725 | * Event notification. | 2520 | * Event notification. |
| 1726 | *---------------------------------------------------------------*/ | 2521 | *---------------------------------------------------------------*/ |
| 1727 | void dm_kobject_uevent(struct mapped_device *md) | 2522 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
| 1728 | { | 2523 | unsigned cookie) |
| 1729 | kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE); | 2524 | { |
| 2525 | char udev_cookie[DM_COOKIE_LENGTH]; | ||
| 2526 | char *envp[] = { udev_cookie, NULL }; | ||
| 2527 | |||
| 2528 | if (!cookie) | ||
| 2529 | kobject_uevent(&disk_to_dev(md->disk)->kobj, action); | ||
| 2530 | else { | ||
| 2531 | snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", | ||
| 2532 | DM_COOKIE_ENV_VAR_NAME, cookie); | ||
| 2533 | kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); | ||
| 2534 | } | ||
| 1730 | } | 2535 | } |
| 1731 | 2536 | ||
| 1732 | uint32_t dm_next_uevent_seq(struct mapped_device *md) | 2537 | uint32_t dm_next_uevent_seq(struct mapped_device *md) |
| @@ -1780,6 +2585,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) | |||
| 1780 | if (&md->kobj != kobj) | 2585 | if (&md->kobj != kobj) |
| 1781 | return NULL; | 2586 | return NULL; |
| 1782 | 2587 | ||
| 2588 | if (test_bit(DMF_FREEING, &md->flags) || | ||
| 2589 | test_bit(DMF_DELETING, &md->flags)) | ||
| 2590 | return NULL; | ||
| 2591 | |||
| 1783 | dm_get(md); | 2592 | dm_get(md); |
| 1784 | return md; | 2593 | return md; |
| 1785 | } | 2594 | } |
| @@ -1800,6 +2609,61 @@ int dm_noflush_suspending(struct dm_target *ti) | |||
| 1800 | } | 2609 | } |
| 1801 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 2610 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
| 1802 | 2611 | ||
| 2612 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) | ||
| 2613 | { | ||
| 2614 | struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); | ||
| 2615 | |||
| 2616 | if (!pools) | ||
| 2617 | return NULL; | ||
| 2618 | |||
| 2619 | pools->io_pool = (type == DM_TYPE_BIO_BASED) ? | ||
| 2620 | mempool_create_slab_pool(MIN_IOS, _io_cache) : | ||
| 2621 | mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); | ||
| 2622 | if (!pools->io_pool) | ||
| 2623 | goto free_pools_and_out; | ||
| 2624 | |||
| 2625 | pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? | ||
| 2626 | mempool_create_slab_pool(MIN_IOS, _tio_cache) : | ||
| 2627 | mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); | ||
| 2628 | if (!pools->tio_pool) | ||
| 2629 | goto free_io_pool_and_out; | ||
| 2630 | |||
| 2631 | pools->bs = (type == DM_TYPE_BIO_BASED) ? | ||
| 2632 | bioset_create(16, 0) : bioset_create(MIN_IOS, 0); | ||
| 2633 | if (!pools->bs) | ||
| 2634 | goto free_tio_pool_and_out; | ||
| 2635 | |||
| 2636 | return pools; | ||
| 2637 | |||
| 2638 | free_tio_pool_and_out: | ||
| 2639 | mempool_destroy(pools->tio_pool); | ||
| 2640 | |||
| 2641 | free_io_pool_and_out: | ||
| 2642 | mempool_destroy(pools->io_pool); | ||
| 2643 | |||
| 2644 | free_pools_and_out: | ||
| 2645 | kfree(pools); | ||
| 2646 | |||
| 2647 | return NULL; | ||
| 2648 | } | ||
| 2649 | |||
| 2650 | void dm_free_md_mempools(struct dm_md_mempools *pools) | ||
| 2651 | { | ||
| 2652 | if (!pools) | ||
| 2653 | return; | ||
| 2654 | |||
| 2655 | if (pools->io_pool) | ||
| 2656 | mempool_destroy(pools->io_pool); | ||
| 2657 | |||
| 2658 | if (pools->tio_pool) | ||
| 2659 | mempool_destroy(pools->tio_pool); | ||
| 2660 | |||
| 2661 | if (pools->bs) | ||
| 2662 | bioset_free(pools->bs); | ||
| 2663 | |||
| 2664 | kfree(pools); | ||
| 2665 | } | ||
| 2666 | |||
| 1803 | static struct block_device_operations dm_blk_dops = { | 2667 | static struct block_device_operations dm_blk_dops = { |
| 1804 | .open = dm_blk_open, | 2668 | .open = dm_blk_open, |
| 1805 | .release = dm_blk_close, | 2669 | .release = dm_blk_close, |
diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a31506d93e91..23278ae80f08 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h | |||
| @@ -23,6 +23,13 @@ | |||
| 23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) | 23 | #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) |
| 24 | 24 | ||
| 25 | /* | 25 | /* |
| 26 | * Type of table and mapped_device's mempool | ||
| 27 | */ | ||
| 28 | #define DM_TYPE_NONE 0 | ||
| 29 | #define DM_TYPE_BIO_BASED 1 | ||
| 30 | #define DM_TYPE_REQUEST_BASED 2 | ||
| 31 | |||
| 32 | /* | ||
| 26 | * List of devices that a metadevice uses and should open/close. | 33 | * List of devices that a metadevice uses and should open/close. |
| 27 | */ | 34 | */ |
| 28 | struct dm_dev_internal { | 35 | struct dm_dev_internal { |
| @@ -32,6 +39,7 @@ struct dm_dev_internal { | |||
| 32 | }; | 39 | }; |
| 33 | 40 | ||
| 34 | struct dm_table; | 41 | struct dm_table; |
| 42 | struct dm_md_mempools; | ||
| 35 | 43 | ||
| 36 | /*----------------------------------------------------------------- | 44 | /*----------------------------------------------------------------- |
| 37 | * Internal table functions. | 45 | * Internal table functions. |
| @@ -41,18 +49,34 @@ void dm_table_event_callback(struct dm_table *t, | |||
| 41 | void (*fn)(void *), void *context); | 49 | void (*fn)(void *), void *context); |
| 42 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); | 50 | struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); |
| 43 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); | 51 | struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); |
| 44 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); | 52 | int dm_calculate_queue_limits(struct dm_table *table, |
| 53 | struct queue_limits *limits); | ||
| 54 | void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, | ||
| 55 | struct queue_limits *limits); | ||
| 45 | struct list_head *dm_table_get_devices(struct dm_table *t); | 56 | struct list_head *dm_table_get_devices(struct dm_table *t); |
| 46 | void dm_table_presuspend_targets(struct dm_table *t); | 57 | void dm_table_presuspend_targets(struct dm_table *t); |
| 47 | void dm_table_postsuspend_targets(struct dm_table *t); | 58 | void dm_table_postsuspend_targets(struct dm_table *t); |
| 48 | int dm_table_resume_targets(struct dm_table *t); | 59 | int dm_table_resume_targets(struct dm_table *t); |
| 49 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); | 60 | int dm_table_any_congested(struct dm_table *t, int bdi_bits); |
| 61 | int dm_table_any_busy_target(struct dm_table *t); | ||
| 62 | int dm_table_set_type(struct dm_table *t); | ||
| 63 | unsigned dm_table_get_type(struct dm_table *t); | ||
| 64 | bool dm_table_bio_based(struct dm_table *t); | ||
| 65 | bool dm_table_request_based(struct dm_table *t); | ||
| 66 | int dm_table_alloc_md_mempools(struct dm_table *t); | ||
| 67 | void dm_table_free_md_mempools(struct dm_table *t); | ||
| 68 | struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); | ||
| 50 | 69 | ||
| 51 | /* | 70 | /* |
| 52 | * To check the return value from dm_table_find_target(). | 71 | * To check the return value from dm_table_find_target(). |
| 53 | */ | 72 | */ |
| 54 | #define dm_target_is_valid(t) ((t)->table) | 73 | #define dm_target_is_valid(t) ((t)->table) |
| 55 | 74 | ||
| 75 | /* | ||
| 76 | * To check whether the target type is request-based or not (bio-based). | ||
| 77 | */ | ||
| 78 | #define dm_target_request_based(t) ((t)->type->map_rq != NULL) | ||
| 79 | |||
| 56 | /*----------------------------------------------------------------- | 80 | /*----------------------------------------------------------------- |
| 57 | * A registry of target types. | 81 | * A registry of target types. |
| 58 | *---------------------------------------------------------------*/ | 82 | *---------------------------------------------------------------*/ |
| @@ -92,9 +116,16 @@ void dm_stripe_exit(void); | |||
| 92 | int dm_open_count(struct mapped_device *md); | 116 | int dm_open_count(struct mapped_device *md); |
| 93 | int dm_lock_for_deletion(struct mapped_device *md); | 117 | int dm_lock_for_deletion(struct mapped_device *md); |
| 94 | 118 | ||
| 95 | void dm_kobject_uevent(struct mapped_device *md); | 119 | void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, |
| 120 | unsigned cookie); | ||
| 96 | 121 | ||
| 97 | int dm_kcopyd_init(void); | 122 | int dm_kcopyd_init(void); |
| 98 | void dm_kcopyd_exit(void); | 123 | void dm_kcopyd_exit(void); |
| 99 | 124 | ||
| 125 | /* | ||
| 126 | * Mempool operations | ||
| 127 | */ | ||
| 128 | struct dm_md_mempools *dm_alloc_md_mempools(unsigned type); | ||
| 129 | void dm_free_md_mempools(struct dm_md_mempools *pools); | ||
| 130 | |||
| 100 | #endif | 131 | #endif |
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 8695809b24b0..87d88dbb667f 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
| @@ -255,14 +255,14 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
| 255 | } | 255 | } |
| 256 | 256 | ||
| 257 | 257 | ||
| 258 | static int reconfig(mddev_t *mddev, int layout, int chunk_size) | 258 | static int reshape(mddev_t *mddev) |
| 259 | { | 259 | { |
| 260 | int mode = layout & ModeMask; | 260 | int mode = mddev->new_layout & ModeMask; |
| 261 | int count = layout >> ModeShift; | 261 | int count = mddev->new_layout >> ModeShift; |
| 262 | conf_t *conf = mddev->private; | 262 | conf_t *conf = mddev->private; |
| 263 | 263 | ||
| 264 | if (chunk_size != -1) | 264 | if (mddev->new_layout < 0) |
| 265 | return -EINVAL; | 265 | return 0; |
| 266 | 266 | ||
| 267 | /* new layout */ | 267 | /* new layout */ |
| 268 | if (mode == ClearFaults) | 268 | if (mode == ClearFaults) |
| @@ -279,6 +279,7 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) | |||
| 279 | atomic_set(&conf->counters[mode], count); | 279 | atomic_set(&conf->counters[mode], count); |
| 280 | } else | 280 | } else |
| 281 | return -EINVAL; | 281 | return -EINVAL; |
| 282 | mddev->new_layout = -1; | ||
| 282 | mddev->layout = -1; /* makes sure further changes come through */ | 283 | mddev->layout = -1; /* makes sure further changes come through */ |
| 283 | return 0; | 284 | return 0; |
| 284 | } | 285 | } |
| @@ -298,8 +299,12 @@ static int run(mddev_t *mddev) | |||
| 298 | { | 299 | { |
| 299 | mdk_rdev_t *rdev; | 300 | mdk_rdev_t *rdev; |
| 300 | int i; | 301 | int i; |
| 302 | conf_t *conf; | ||
| 303 | |||
| 304 | if (md_check_no_bitmap(mddev)) | ||
| 305 | return -EINVAL; | ||
| 301 | 306 | ||
| 302 | conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); | 307 | conf = kmalloc(sizeof(*conf), GFP_KERNEL); |
| 303 | if (!conf) | 308 | if (!conf) |
| 304 | return -ENOMEM; | 309 | return -ENOMEM; |
| 305 | 310 | ||
| @@ -315,7 +320,7 @@ static int run(mddev_t *mddev) | |||
| 315 | md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); | 320 | md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); |
| 316 | mddev->private = conf; | 321 | mddev->private = conf; |
| 317 | 322 | ||
| 318 | reconfig(mddev, mddev->layout, -1); | 323 | reshape(mddev); |
| 319 | 324 | ||
| 320 | return 0; | 325 | return 0; |
| 321 | } | 326 | } |
| @@ -338,7 +343,7 @@ static struct mdk_personality faulty_personality = | |||
| 338 | .run = run, | 343 | .run = run, |
| 339 | .stop = stop, | 344 | .stop = stop, |
| 340 | .status = status, | 345 | .status = status, |
| 341 | .reconfig = reconfig, | 346 | .check_reshape = reshape, |
| 342 | .size = faulty_size, | 347 | .size = faulty_size, |
| 343 | }; | 348 | }; |
| 344 | 349 | ||
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 7a36e38393a1..15c8b7b25a9b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
| @@ -27,19 +27,27 @@ | |||
| 27 | */ | 27 | */ |
| 28 | static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) | 28 | static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) |
| 29 | { | 29 | { |
| 30 | dev_info_t *hash; | 30 | int lo, mid, hi; |
| 31 | linear_conf_t *conf = mddev_to_conf(mddev); | 31 | linear_conf_t *conf; |
| 32 | sector_t idx = sector >> conf->sector_shift; | 32 | |
| 33 | lo = 0; | ||
| 34 | hi = mddev->raid_disks - 1; | ||
| 35 | conf = rcu_dereference(mddev->private); | ||
| 33 | 36 | ||
| 34 | /* | 37 | /* |
| 35 | * sector_div(a,b) returns the remainer and sets a to a/b | 38 | * Binary Search |
| 36 | */ | 39 | */ |
| 37 | (void)sector_div(idx, conf->spacing); | ||
| 38 | hash = conf->hash_table[idx]; | ||
| 39 | 40 | ||
| 40 | while (sector >= hash->num_sectors + hash->start_sector) | 41 | while (hi > lo) { |
| 41 | hash++; | 42 | |
| 42 | return hash; | 43 | mid = (hi + lo) / 2; |
| 44 | if (sector < conf->disks[mid].end_sector) | ||
| 45 | hi = mid; | ||
| 46 | else | ||
| 47 | lo = mid + 1; | ||
| 48 | } | ||
| 49 | |||
| 50 | return conf->disks + lo; | ||
| 43 | } | 51 | } |
| 44 | 52 | ||
| 45 | /** | 53 | /** |
| @@ -59,8 +67,10 @@ static int linear_mergeable_bvec(struct request_queue *q, | |||
| 59 | unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; | 67 | unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; |
| 60 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 68 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
| 61 | 69 | ||
| 70 | rcu_read_lock(); | ||
| 62 | dev0 = which_dev(mddev, sector); | 71 | dev0 = which_dev(mddev, sector); |
| 63 | maxsectors = dev0->num_sectors - (sector - dev0->start_sector); | 72 | maxsectors = dev0->end_sector - sector; |
| 73 | rcu_read_unlock(); | ||
| 64 | 74 | ||
| 65 | if (maxsectors < bio_sectors) | 75 | if (maxsectors < bio_sectors) |
| 66 | maxsectors = 0; | 76 | maxsectors = 0; |
| @@ -79,46 +89,57 @@ static int linear_mergeable_bvec(struct request_queue *q, | |||
| 79 | static void linear_unplug(struct request_queue *q) | 89 | static void linear_unplug(struct request_queue *q) |
| 80 | { | 90 | { |
| 81 | mddev_t *mddev = q->queuedata; | 91 | mddev_t *mddev = q->queuedata; |
| 82 | linear_conf_t *conf = mddev_to_conf(mddev); | 92 | linear_conf_t *conf; |
| 83 | int i; | 93 | int i; |
| 84 | 94 | ||
| 95 | rcu_read_lock(); | ||
| 96 | conf = rcu_dereference(mddev->private); | ||
| 97 | |||
| 85 | for (i=0; i < mddev->raid_disks; i++) { | 98 | for (i=0; i < mddev->raid_disks; i++) { |
| 86 | struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); | 99 | struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); |
| 87 | blk_unplug(r_queue); | 100 | blk_unplug(r_queue); |
| 88 | } | 101 | } |
| 102 | rcu_read_unlock(); | ||
| 89 | } | 103 | } |
| 90 | 104 | ||
| 91 | static int linear_congested(void *data, int bits) | 105 | static int linear_congested(void *data, int bits) |
| 92 | { | 106 | { |
| 93 | mddev_t *mddev = data; | 107 | mddev_t *mddev = data; |
| 94 | linear_conf_t *conf = mddev_to_conf(mddev); | 108 | linear_conf_t *conf; |
| 95 | int i, ret = 0; | 109 | int i, ret = 0; |
| 96 | 110 | ||
| 111 | rcu_read_lock(); | ||
| 112 | conf = rcu_dereference(mddev->private); | ||
| 113 | |||
| 97 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { | 114 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { |
| 98 | struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); | 115 | struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); |
| 99 | ret |= bdi_congested(&q->backing_dev_info, bits); | 116 | ret |= bdi_congested(&q->backing_dev_info, bits); |
| 100 | } | 117 | } |
| 118 | |||
| 119 | rcu_read_unlock(); | ||
| 101 | return ret; | 120 | return ret; |
| 102 | } | 121 | } |
| 103 | 122 | ||
| 104 | static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) | 123 | static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) |
| 105 | { | 124 | { |
| 106 | linear_conf_t *conf = mddev_to_conf(mddev); | 125 | linear_conf_t *conf; |
| 126 | sector_t array_sectors; | ||
| 107 | 127 | ||
| 128 | rcu_read_lock(); | ||
| 129 | conf = rcu_dereference(mddev->private); | ||
| 108 | WARN_ONCE(sectors || raid_disks, | 130 | WARN_ONCE(sectors || raid_disks, |
| 109 | "%s does not support generic reshape\n", __func__); | 131 | "%s does not support generic reshape\n", __func__); |
| 132 | array_sectors = conf->array_sectors; | ||
| 133 | rcu_read_unlock(); | ||
| 110 | 134 | ||
| 111 | return conf->array_sectors; | 135 | return array_sectors; |
| 112 | } | 136 | } |
| 113 | 137 | ||
| 114 | static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | 138 | static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) |
| 115 | { | 139 | { |
| 116 | linear_conf_t *conf; | 140 | linear_conf_t *conf; |
| 117 | dev_info_t **table; | ||
| 118 | mdk_rdev_t *rdev; | 141 | mdk_rdev_t *rdev; |
| 119 | int i, nb_zone, cnt; | 142 | int i, cnt; |
| 120 | sector_t min_sectors; | ||
| 121 | sector_t curr_sector; | ||
| 122 | 143 | ||
| 123 | conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), | 144 | conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), |
| 124 | GFP_KERNEL); | 145 | GFP_KERNEL); |
| @@ -131,6 +152,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
| 131 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 152 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
| 132 | int j = rdev->raid_disk; | 153 | int j = rdev->raid_disk; |
| 133 | dev_info_t *disk = conf->disks + j; | 154 | dev_info_t *disk = conf->disks + j; |
| 155 | sector_t sectors; | ||
| 134 | 156 | ||
| 135 | if (j < 0 || j >= raid_disks || disk->rdev) { | 157 | if (j < 0 || j >= raid_disks || disk->rdev) { |
| 136 | printk("linear: disk numbering problem. Aborting!\n"); | 158 | printk("linear: disk numbering problem. Aborting!\n"); |
| @@ -138,6 +160,11 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
| 138 | } | 160 | } |
| 139 | 161 | ||
| 140 | disk->rdev = rdev; | 162 | disk->rdev = rdev; |
| 163 | if (mddev->chunk_sectors) { | ||
| 164 | sectors = rdev->sectors; | ||
| 165 | sector_div(sectors, mddev->chunk_sectors); | ||
| 166 | rdev->sectors = sectors * mddev->chunk_sectors; | ||
| 167 | } | ||
| 141 | 168 | ||
| 142 | blk_queue_stack_limits(mddev->queue, | 169 | blk_queue_stack_limits(mddev->queue, |
| 143 | rdev->bdev->bd_disk->queue); | 170 | rdev->bdev->bd_disk->queue); |
| @@ -146,105 +173,27 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) | |||
| 146 | * a one page request is never in violation. | 173 | * a one page request is never in violation. |
| 147 | */ | 174 | */ |
| 148 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 175 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 149 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 176 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 150 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 177 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 151 | 178 | ||
| 152 | disk->num_sectors = rdev->sectors; | ||
| 153 | conf->array_sectors += rdev->sectors; | 179 | conf->array_sectors += rdev->sectors; |
| 154 | |||
| 155 | cnt++; | 180 | cnt++; |
| 181 | |||
| 156 | } | 182 | } |
| 157 | if (cnt != raid_disks) { | 183 | if (cnt != raid_disks) { |
| 158 | printk("linear: not enough drives present. Aborting!\n"); | 184 | printk("linear: not enough drives present. Aborting!\n"); |
| 159 | goto out; | 185 | goto out; |
| 160 | } | 186 | } |
| 161 | 187 | ||
| 162 | min_sectors = conf->array_sectors; | ||
| 163 | sector_div(min_sectors, PAGE_SIZE/sizeof(struct dev_info *)); | ||
| 164 | if (min_sectors == 0) | ||
| 165 | min_sectors = 1; | ||
| 166 | |||
| 167 | /* min_sectors is the minimum spacing that will fit the hash | ||
| 168 | * table in one PAGE. This may be much smaller than needed. | ||
| 169 | * We find the smallest non-terminal set of consecutive devices | ||
| 170 | * that is larger than min_sectors and use the size of that as | ||
| 171 | * the actual spacing | ||
| 172 | */ | ||
| 173 | conf->spacing = conf->array_sectors; | ||
| 174 | for (i=0; i < cnt-1 ; i++) { | ||
| 175 | sector_t tmp = 0; | ||
| 176 | int j; | ||
| 177 | for (j = i; j < cnt - 1 && tmp < min_sectors; j++) | ||
| 178 | tmp += conf->disks[j].num_sectors; | ||
| 179 | if (tmp >= min_sectors && tmp < conf->spacing) | ||
| 180 | conf->spacing = tmp; | ||
| 181 | } | ||
| 182 | |||
| 183 | /* spacing may be too large for sector_div to work with, | ||
| 184 | * so we might need to pre-shift | ||
| 185 | */ | ||
| 186 | conf->sector_shift = 0; | ||
| 187 | if (sizeof(sector_t) > sizeof(u32)) { | ||
| 188 | sector_t space = conf->spacing; | ||
| 189 | while (space > (sector_t)(~(u32)0)) { | ||
| 190 | space >>= 1; | ||
| 191 | conf->sector_shift++; | ||
| 192 | } | ||
| 193 | } | ||
| 194 | /* | 188 | /* |
| 195 | * This code was restructured to work around a gcc-2.95.3 internal | 189 | * Here we calculate the device offsets. |
| 196 | * compiler error. Alter it with care. | ||
| 197 | */ | 190 | */ |
| 198 | { | 191 | conf->disks[0].end_sector = conf->disks[0].rdev->sectors; |
| 199 | sector_t sz; | ||
| 200 | unsigned round; | ||
| 201 | unsigned long base; | ||
| 202 | |||
| 203 | sz = conf->array_sectors >> conf->sector_shift; | ||
| 204 | sz += 1; /* force round-up */ | ||
| 205 | base = conf->spacing >> conf->sector_shift; | ||
| 206 | round = sector_div(sz, base); | ||
| 207 | nb_zone = sz + (round ? 1 : 0); | ||
| 208 | } | ||
| 209 | BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); | ||
| 210 | |||
| 211 | conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, | ||
| 212 | GFP_KERNEL); | ||
| 213 | if (!conf->hash_table) | ||
| 214 | goto out; | ||
| 215 | 192 | ||
| 216 | /* | ||
| 217 | * Here we generate the linear hash table | ||
| 218 | * First calculate the device offsets. | ||
| 219 | */ | ||
| 220 | conf->disks[0].start_sector = 0; | ||
| 221 | for (i = 1; i < raid_disks; i++) | 193 | for (i = 1; i < raid_disks; i++) |
| 222 | conf->disks[i].start_sector = | 194 | conf->disks[i].end_sector = |
| 223 | conf->disks[i-1].start_sector + | 195 | conf->disks[i-1].end_sector + |
| 224 | conf->disks[i-1].num_sectors; | 196 | conf->disks[i].rdev->sectors; |
| 225 | |||
| 226 | table = conf->hash_table; | ||
| 227 | i = 0; | ||
| 228 | for (curr_sector = 0; | ||
| 229 | curr_sector < conf->array_sectors; | ||
| 230 | curr_sector += conf->spacing) { | ||
| 231 | |||
| 232 | while (i < raid_disks-1 && | ||
| 233 | curr_sector >= conf->disks[i+1].start_sector) | ||
| 234 | i++; | ||
| 235 | |||
| 236 | *table ++ = conf->disks + i; | ||
| 237 | } | ||
| 238 | |||
| 239 | if (conf->sector_shift) { | ||
| 240 | conf->spacing >>= conf->sector_shift; | ||
| 241 | /* round spacing up so that when we divide by it, | ||
| 242 | * we err on the side of "too-low", which is safest. | ||
| 243 | */ | ||
| 244 | conf->spacing++; | ||
| 245 | } | ||
| 246 | |||
| 247 | BUG_ON(table - conf->hash_table > nb_zone); | ||
| 248 | 197 | ||
| 249 | return conf; | 198 | return conf; |
| 250 | 199 | ||
| @@ -257,6 +206,8 @@ static int linear_run (mddev_t *mddev) | |||
| 257 | { | 206 | { |
| 258 | linear_conf_t *conf; | 207 | linear_conf_t *conf; |
| 259 | 208 | ||
| 209 | if (md_check_no_bitmap(mddev)) | ||
| 210 | return -EINVAL; | ||
| 260 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | 211 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; |
| 261 | conf = linear_conf(mddev, mddev->raid_disks); | 212 | conf = linear_conf(mddev, mddev->raid_disks); |
| 262 | 213 | ||
| @@ -272,6 +223,12 @@ static int linear_run (mddev_t *mddev) | |||
| 272 | return 0; | 223 | return 0; |
| 273 | } | 224 | } |
| 274 | 225 | ||
| 226 | static void free_conf(struct rcu_head *head) | ||
| 227 | { | ||
| 228 | linear_conf_t *conf = container_of(head, linear_conf_t, rcu); | ||
| 229 | kfree(conf); | ||
| 230 | } | ||
| 231 | |||
| 275 | static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | 232 | static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) |
| 276 | { | 233 | { |
| 277 | /* Adding a drive to a linear array allows the array to grow. | 234 | /* Adding a drive to a linear array allows the array to grow. |
| @@ -282,7 +239,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 282 | * The current one is never freed until the array is stopped. | 239 | * The current one is never freed until the array is stopped. |
| 283 | * This avoids races. | 240 | * This avoids races. |
| 284 | */ | 241 | */ |
| 285 | linear_conf_t *newconf; | 242 | linear_conf_t *newconf, *oldconf; |
| 286 | 243 | ||
| 287 | if (rdev->saved_raid_disk != mddev->raid_disks) | 244 | if (rdev->saved_raid_disk != mddev->raid_disks) |
| 288 | return -EINVAL; | 245 | return -EINVAL; |
| @@ -294,25 +251,29 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 294 | if (!newconf) | 251 | if (!newconf) |
| 295 | return -ENOMEM; | 252 | return -ENOMEM; |
| 296 | 253 | ||
| 297 | newconf->prev = mddev_to_conf(mddev); | 254 | oldconf = rcu_dereference(mddev->private); |
| 298 | mddev->private = newconf; | ||
| 299 | mddev->raid_disks++; | 255 | mddev->raid_disks++; |
| 256 | rcu_assign_pointer(mddev->private, newconf); | ||
| 300 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); | 257 | md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); |
| 301 | set_capacity(mddev->gendisk, mddev->array_sectors); | 258 | set_capacity(mddev->gendisk, mddev->array_sectors); |
| 259 | call_rcu(&oldconf->rcu, free_conf); | ||
| 302 | return 0; | 260 | return 0; |
| 303 | } | 261 | } |
| 304 | 262 | ||
| 305 | static int linear_stop (mddev_t *mddev) | 263 | static int linear_stop (mddev_t *mddev) |
| 306 | { | 264 | { |
| 307 | linear_conf_t *conf = mddev_to_conf(mddev); | 265 | linear_conf_t *conf = mddev->private; |
| 308 | 266 | ||
| 267 | /* | ||
| 268 | * We do not require rcu protection here since | ||
| 269 | * we hold reconfig_mutex for both linear_add and | ||
| 270 | * linear_stop, so they cannot race. | ||
| 271 | * We should make sure any old 'conf's are properly | ||
| 272 | * freed though. | ||
| 273 | */ | ||
| 274 | rcu_barrier(); | ||
| 309 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 275 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ |
| 310 | do { | 276 | kfree(conf); |
| 311 | linear_conf_t *t = conf->prev; | ||
| 312 | kfree(conf->hash_table); | ||
| 313 | kfree(conf); | ||
| 314 | conf = t; | ||
| 315 | } while (conf); | ||
| 316 | 277 | ||
| 317 | return 0; | 278 | return 0; |
| 318 | } | 279 | } |
| @@ -322,6 +283,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
| 322 | const int rw = bio_data_dir(bio); | 283 | const int rw = bio_data_dir(bio); |
| 323 | mddev_t *mddev = q->queuedata; | 284 | mddev_t *mddev = q->queuedata; |
| 324 | dev_info_t *tmp_dev; | 285 | dev_info_t *tmp_dev; |
| 286 | sector_t start_sector; | ||
| 325 | int cpu; | 287 | int cpu; |
| 326 | 288 | ||
| 327 | if (unlikely(bio_barrier(bio))) { | 289 | if (unlikely(bio_barrier(bio))) { |
| @@ -335,33 +297,36 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
| 335 | bio_sectors(bio)); | 297 | bio_sectors(bio)); |
| 336 | part_stat_unlock(); | 298 | part_stat_unlock(); |
| 337 | 299 | ||
| 300 | rcu_read_lock(); | ||
| 338 | tmp_dev = which_dev(mddev, bio->bi_sector); | 301 | tmp_dev = which_dev(mddev, bio->bi_sector); |
| 339 | 302 | start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; | |
| 340 | if (unlikely(bio->bi_sector >= (tmp_dev->num_sectors + | 303 | |
| 341 | tmp_dev->start_sector) | 304 | |
| 342 | || (bio->bi_sector < | 305 | if (unlikely(bio->bi_sector >= (tmp_dev->end_sector) |
| 343 | tmp_dev->start_sector))) { | 306 | || (bio->bi_sector < start_sector))) { |
| 344 | char b[BDEVNAME_SIZE]; | 307 | char b[BDEVNAME_SIZE]; |
| 345 | 308 | ||
| 346 | printk("linear_make_request: Sector %llu out of bounds on " | 309 | printk("linear_make_request: Sector %llu out of bounds on " |
| 347 | "dev %s: %llu sectors, offset %llu\n", | 310 | "dev %s: %llu sectors, offset %llu\n", |
| 348 | (unsigned long long)bio->bi_sector, | 311 | (unsigned long long)bio->bi_sector, |
| 349 | bdevname(tmp_dev->rdev->bdev, b), | 312 | bdevname(tmp_dev->rdev->bdev, b), |
| 350 | (unsigned long long)tmp_dev->num_sectors, | 313 | (unsigned long long)tmp_dev->rdev->sectors, |
| 351 | (unsigned long long)tmp_dev->start_sector); | 314 | (unsigned long long)start_sector); |
| 315 | rcu_read_unlock(); | ||
| 352 | bio_io_error(bio); | 316 | bio_io_error(bio); |
| 353 | return 0; | 317 | return 0; |
| 354 | } | 318 | } |
| 355 | if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > | 319 | if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > |
| 356 | tmp_dev->start_sector + tmp_dev->num_sectors)) { | 320 | tmp_dev->end_sector)) { |
| 357 | /* This bio crosses a device boundary, so we have to | 321 | /* This bio crosses a device boundary, so we have to |
| 358 | * split it. | 322 | * split it. |
| 359 | */ | 323 | */ |
| 360 | struct bio_pair *bp; | 324 | struct bio_pair *bp; |
| 325 | sector_t end_sector = tmp_dev->end_sector; | ||
| 326 | |||
| 327 | rcu_read_unlock(); | ||
| 361 | 328 | ||
| 362 | bp = bio_split(bio, | 329 | bp = bio_split(bio, end_sector - bio->bi_sector); |
| 363 | tmp_dev->start_sector + tmp_dev->num_sectors | ||
| 364 | - bio->bi_sector); | ||
| 365 | 330 | ||
| 366 | if (linear_make_request(q, &bp->bio1)) | 331 | if (linear_make_request(q, &bp->bio1)) |
| 367 | generic_make_request(&bp->bio1); | 332 | generic_make_request(&bp->bio1); |
| @@ -372,8 +337,9 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
| 372 | } | 337 | } |
| 373 | 338 | ||
| 374 | bio->bi_bdev = tmp_dev->rdev->bdev; | 339 | bio->bi_bdev = tmp_dev->rdev->bdev; |
| 375 | bio->bi_sector = bio->bi_sector - tmp_dev->start_sector | 340 | bio->bi_sector = bio->bi_sector - start_sector |
| 376 | + tmp_dev->rdev->data_offset; | 341 | + tmp_dev->rdev->data_offset; |
| 342 | rcu_read_unlock(); | ||
| 377 | 343 | ||
| 378 | return 1; | 344 | return 1; |
| 379 | } | 345 | } |
| @@ -381,7 +347,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) | |||
| 381 | static void linear_status (struct seq_file *seq, mddev_t *mddev) | 347 | static void linear_status (struct seq_file *seq, mddev_t *mddev) |
| 382 | { | 348 | { |
| 383 | 349 | ||
| 384 | seq_printf(seq, " %dk rounding", mddev->chunk_size/1024); | 350 | seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); |
| 385 | } | 351 | } |
| 386 | 352 | ||
| 387 | 353 | ||
diff --git a/drivers/md/linear.h b/drivers/md/linear.h index bf8179587f95..0ce29b61605a 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h | |||
| @@ -3,27 +3,19 @@ | |||
| 3 | 3 | ||
| 4 | struct dev_info { | 4 | struct dev_info { |
| 5 | mdk_rdev_t *rdev; | 5 | mdk_rdev_t *rdev; |
| 6 | sector_t num_sectors; | 6 | sector_t end_sector; |
| 7 | sector_t start_sector; | ||
| 8 | }; | 7 | }; |
| 9 | 8 | ||
| 10 | typedef struct dev_info dev_info_t; | 9 | typedef struct dev_info dev_info_t; |
| 11 | 10 | ||
| 12 | struct linear_private_data | 11 | struct linear_private_data |
| 13 | { | 12 | { |
| 14 | struct linear_private_data *prev; /* earlier version */ | ||
| 15 | dev_info_t **hash_table; | ||
| 16 | sector_t spacing; | ||
| 17 | sector_t array_sectors; | 13 | sector_t array_sectors; |
| 18 | int sector_shift; /* shift before dividing | ||
| 19 | * by spacing | ||
| 20 | */ | ||
| 21 | dev_info_t disks[0]; | 14 | dev_info_t disks[0]; |
| 15 | struct rcu_head rcu; | ||
| 22 | }; | 16 | }; |
| 23 | 17 | ||
| 24 | 18 | ||
| 25 | typedef struct linear_private_data linear_conf_t; | 19 | typedef struct linear_private_data linear_conf_t; |
| 26 | 20 | ||
| 27 | #define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) | ||
| 28 | |||
| 29 | #endif | 21 | #endif |
diff --git a/drivers/md/md.c b/drivers/md/md.c index 641b211fe3fe..09be637d52cb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
| @@ -440,15 +440,6 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev) | |||
| 440 | return MD_NEW_SIZE_SECTORS(num_sectors); | 440 | return MD_NEW_SIZE_SECTORS(num_sectors); |
| 441 | } | 441 | } |
| 442 | 442 | ||
| 443 | static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) | ||
| 444 | { | ||
| 445 | sector_t num_sectors = rdev->sb_start; | ||
| 446 | |||
| 447 | if (chunk_size) | ||
| 448 | num_sectors &= ~((sector_t)chunk_size/512 - 1); | ||
| 449 | return num_sectors; | ||
| 450 | } | ||
| 451 | |||
| 452 | static int alloc_disk_sb(mdk_rdev_t * rdev) | 443 | static int alloc_disk_sb(mdk_rdev_t * rdev) |
| 453 | { | 444 | { |
| 454 | if (rdev->sb_page) | 445 | if (rdev->sb_page) |
| @@ -745,6 +736,24 @@ struct super_type { | |||
| 745 | }; | 736 | }; |
| 746 | 737 | ||
| 747 | /* | 738 | /* |
| 739 | * Check that the given mddev has no bitmap. | ||
| 740 | * | ||
| 741 | * This function is called from the run method of all personalities that do not | ||
| 742 | * support bitmaps. It prints an error message and returns non-zero if mddev | ||
| 743 | * has a bitmap. Otherwise, it returns 0. | ||
| 744 | * | ||
| 745 | */ | ||
| 746 | int md_check_no_bitmap(mddev_t *mddev) | ||
| 747 | { | ||
| 748 | if (!mddev->bitmap_file && !mddev->bitmap_offset) | ||
| 749 | return 0; | ||
| 750 | printk(KERN_ERR "%s: bitmaps are not supported for %s\n", | ||
| 751 | mdname(mddev), mddev->pers->name); | ||
| 752 | return 1; | ||
| 753 | } | ||
| 754 | EXPORT_SYMBOL(md_check_no_bitmap); | ||
| 755 | |||
| 756 | /* | ||
| 748 | * load_super for 0.90.0 | 757 | * load_super for 0.90.0 |
| 749 | */ | 758 | */ |
| 750 | static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | 759 | static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) |
| @@ -797,17 +806,6 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
| 797 | rdev->data_offset = 0; | 806 | rdev->data_offset = 0; |
| 798 | rdev->sb_size = MD_SB_BYTES; | 807 | rdev->sb_size = MD_SB_BYTES; |
| 799 | 808 | ||
| 800 | if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { | ||
| 801 | if (sb->level != 1 && sb->level != 4 | ||
| 802 | && sb->level != 5 && sb->level != 6 | ||
| 803 | && sb->level != 10) { | ||
| 804 | /* FIXME use a better test */ | ||
| 805 | printk(KERN_WARNING | ||
| 806 | "md: bitmaps not supported for this level.\n"); | ||
| 807 | goto abort; | ||
| 808 | } | ||
| 809 | } | ||
| 810 | |||
| 811 | if (sb->level == LEVEL_MULTIPATH) | 809 | if (sb->level == LEVEL_MULTIPATH) |
| 812 | rdev->desc_nr = -1; | 810 | rdev->desc_nr = -1; |
| 813 | else | 811 | else |
| @@ -836,7 +834,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
| 836 | else | 834 | else |
| 837 | ret = 0; | 835 | ret = 0; |
| 838 | } | 836 | } |
| 839 | rdev->sectors = calc_num_sectors(rdev, sb->chunk_size); | 837 | rdev->sectors = rdev->sb_start; |
| 840 | 838 | ||
| 841 | if (rdev->sectors < sb->size * 2 && sb->level > 1) | 839 | if (rdev->sectors < sb->size * 2 && sb->level > 1) |
| 842 | /* "this cannot possibly happen" ... */ | 840 | /* "this cannot possibly happen" ... */ |
| @@ -866,7 +864,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 866 | mddev->minor_version = sb->minor_version; | 864 | mddev->minor_version = sb->minor_version; |
| 867 | mddev->patch_version = sb->patch_version; | 865 | mddev->patch_version = sb->patch_version; |
| 868 | mddev->external = 0; | 866 | mddev->external = 0; |
| 869 | mddev->chunk_size = sb->chunk_size; | 867 | mddev->chunk_sectors = sb->chunk_size >> 9; |
| 870 | mddev->ctime = sb->ctime; | 868 | mddev->ctime = sb->ctime; |
| 871 | mddev->utime = sb->utime; | 869 | mddev->utime = sb->utime; |
| 872 | mddev->level = sb->level; | 870 | mddev->level = sb->level; |
| @@ -883,13 +881,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 883 | mddev->delta_disks = sb->delta_disks; | 881 | mddev->delta_disks = sb->delta_disks; |
| 884 | mddev->new_level = sb->new_level; | 882 | mddev->new_level = sb->new_level; |
| 885 | mddev->new_layout = sb->new_layout; | 883 | mddev->new_layout = sb->new_layout; |
| 886 | mddev->new_chunk = sb->new_chunk; | 884 | mddev->new_chunk_sectors = sb->new_chunk >> 9; |
| 887 | } else { | 885 | } else { |
| 888 | mddev->reshape_position = MaxSector; | 886 | mddev->reshape_position = MaxSector; |
| 889 | mddev->delta_disks = 0; | 887 | mddev->delta_disks = 0; |
| 890 | mddev->new_level = mddev->level; | 888 | mddev->new_level = mddev->level; |
| 891 | mddev->new_layout = mddev->layout; | 889 | mddev->new_layout = mddev->layout; |
| 892 | mddev->new_chunk = mddev->chunk_size; | 890 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 893 | } | 891 | } |
| 894 | 892 | ||
| 895 | if (sb->state & (1<<MD_SB_CLEAN)) | 893 | if (sb->state & (1<<MD_SB_CLEAN)) |
| @@ -1004,7 +1002,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1004 | sb->new_level = mddev->new_level; | 1002 | sb->new_level = mddev->new_level; |
| 1005 | sb->delta_disks = mddev->delta_disks; | 1003 | sb->delta_disks = mddev->delta_disks; |
| 1006 | sb->new_layout = mddev->new_layout; | 1004 | sb->new_layout = mddev->new_layout; |
| 1007 | sb->new_chunk = mddev->new_chunk; | 1005 | sb->new_chunk = mddev->new_chunk_sectors << 9; |
| 1008 | } | 1006 | } |
| 1009 | mddev->minor_version = sb->minor_version; | 1007 | mddev->minor_version = sb->minor_version; |
| 1010 | if (mddev->in_sync) | 1008 | if (mddev->in_sync) |
| @@ -1018,7 +1016,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1018 | sb->recovery_cp = 0; | 1016 | sb->recovery_cp = 0; |
| 1019 | 1017 | ||
| 1020 | sb->layout = mddev->layout; | 1018 | sb->layout = mddev->layout; |
| 1021 | sb->chunk_size = mddev->chunk_size; | 1019 | sb->chunk_size = mddev->chunk_sectors << 9; |
| 1022 | 1020 | ||
| 1023 | if (mddev->bitmap && mddev->bitmap_file == NULL) | 1021 | if (mddev->bitmap && mddev->bitmap_file == NULL) |
| 1024 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); | 1022 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); |
| @@ -1185,24 +1183,13 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
| 1185 | bdevname(rdev->bdev,b)); | 1183 | bdevname(rdev->bdev,b)); |
| 1186 | return -EINVAL; | 1184 | return -EINVAL; |
| 1187 | } | 1185 | } |
| 1188 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { | ||
| 1189 | if (sb->level != cpu_to_le32(1) && | ||
| 1190 | sb->level != cpu_to_le32(4) && | ||
| 1191 | sb->level != cpu_to_le32(5) && | ||
| 1192 | sb->level != cpu_to_le32(6) && | ||
| 1193 | sb->level != cpu_to_le32(10)) { | ||
| 1194 | printk(KERN_WARNING | ||
| 1195 | "md: bitmaps not supported for this level.\n"); | ||
| 1196 | return -EINVAL; | ||
| 1197 | } | ||
| 1198 | } | ||
| 1199 | 1186 | ||
| 1200 | rdev->preferred_minor = 0xffff; | 1187 | rdev->preferred_minor = 0xffff; |
| 1201 | rdev->data_offset = le64_to_cpu(sb->data_offset); | 1188 | rdev->data_offset = le64_to_cpu(sb->data_offset); |
| 1202 | atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); | 1189 | atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); |
| 1203 | 1190 | ||
| 1204 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; | 1191 | rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; |
| 1205 | bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; | 1192 | bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; |
| 1206 | if (rdev->sb_size & bmask) | 1193 | if (rdev->sb_size & bmask) |
| 1207 | rdev->sb_size = (rdev->sb_size | bmask) + 1; | 1194 | rdev->sb_size = (rdev->sb_size | bmask) + 1; |
| 1208 | 1195 | ||
| @@ -1248,9 +1235,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
| 1248 | if (rdev->sectors < le64_to_cpu(sb->data_size)) | 1235 | if (rdev->sectors < le64_to_cpu(sb->data_size)) |
| 1249 | return -EINVAL; | 1236 | return -EINVAL; |
| 1250 | rdev->sectors = le64_to_cpu(sb->data_size); | 1237 | rdev->sectors = le64_to_cpu(sb->data_size); |
| 1251 | if (le32_to_cpu(sb->chunksize)) | ||
| 1252 | rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1); | ||
| 1253 | |||
| 1254 | if (le64_to_cpu(sb->size) > rdev->sectors) | 1238 | if (le64_to_cpu(sb->size) > rdev->sectors) |
| 1255 | return -EINVAL; | 1239 | return -EINVAL; |
| 1256 | return ret; | 1240 | return ret; |
| @@ -1271,7 +1255,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1271 | mddev->major_version = 1; | 1255 | mddev->major_version = 1; |
| 1272 | mddev->patch_version = 0; | 1256 | mddev->patch_version = 0; |
| 1273 | mddev->external = 0; | 1257 | mddev->external = 0; |
| 1274 | mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; | 1258 | mddev->chunk_sectors = le32_to_cpu(sb->chunksize); |
| 1275 | mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); | 1259 | mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); |
| 1276 | mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); | 1260 | mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); |
| 1277 | mddev->level = le32_to_cpu(sb->level); | 1261 | mddev->level = le32_to_cpu(sb->level); |
| @@ -1297,13 +1281,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1297 | mddev->delta_disks = le32_to_cpu(sb->delta_disks); | 1281 | mddev->delta_disks = le32_to_cpu(sb->delta_disks); |
| 1298 | mddev->new_level = le32_to_cpu(sb->new_level); | 1282 | mddev->new_level = le32_to_cpu(sb->new_level); |
| 1299 | mddev->new_layout = le32_to_cpu(sb->new_layout); | 1283 | mddev->new_layout = le32_to_cpu(sb->new_layout); |
| 1300 | mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; | 1284 | mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); |
| 1301 | } else { | 1285 | } else { |
| 1302 | mddev->reshape_position = MaxSector; | 1286 | mddev->reshape_position = MaxSector; |
| 1303 | mddev->delta_disks = 0; | 1287 | mddev->delta_disks = 0; |
| 1304 | mddev->new_level = mddev->level; | 1288 | mddev->new_level = mddev->level; |
| 1305 | mddev->new_layout = mddev->layout; | 1289 | mddev->new_layout = mddev->layout; |
| 1306 | mddev->new_chunk = mddev->chunk_size; | 1290 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 1307 | } | 1291 | } |
| 1308 | 1292 | ||
| 1309 | } else if (mddev->pers == NULL) { | 1293 | } else if (mddev->pers == NULL) { |
| @@ -1375,7 +1359,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1375 | 1359 | ||
| 1376 | sb->raid_disks = cpu_to_le32(mddev->raid_disks); | 1360 | sb->raid_disks = cpu_to_le32(mddev->raid_disks); |
| 1377 | sb->size = cpu_to_le64(mddev->dev_sectors); | 1361 | sb->size = cpu_to_le64(mddev->dev_sectors); |
| 1378 | sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9); | 1362 | sb->chunksize = cpu_to_le32(mddev->chunk_sectors); |
| 1379 | sb->level = cpu_to_le32(mddev->level); | 1363 | sb->level = cpu_to_le32(mddev->level); |
| 1380 | sb->layout = cpu_to_le32(mddev->layout); | 1364 | sb->layout = cpu_to_le32(mddev->layout); |
| 1381 | 1365 | ||
| @@ -1402,7 +1386,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1402 | sb->new_layout = cpu_to_le32(mddev->new_layout); | 1386 | sb->new_layout = cpu_to_le32(mddev->new_layout); |
| 1403 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); | 1387 | sb->delta_disks = cpu_to_le32(mddev->delta_disks); |
| 1404 | sb->new_level = cpu_to_le32(mddev->new_level); | 1388 | sb->new_level = cpu_to_le32(mddev->new_level); |
| 1405 | sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); | 1389 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
| 1406 | } | 1390 | } |
| 1407 | 1391 | ||
| 1408 | max_dev = 0; | 1392 | max_dev = 0; |
| @@ -1897,6 +1881,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) | |||
| 1897 | int sync_req; | 1881 | int sync_req; |
| 1898 | int nospares = 0; | 1882 | int nospares = 0; |
| 1899 | 1883 | ||
| 1884 | mddev->utime = get_seconds(); | ||
| 1900 | if (mddev->external) | 1885 | if (mddev->external) |
| 1901 | return; | 1886 | return; |
| 1902 | repeat: | 1887 | repeat: |
| @@ -1926,7 +1911,6 @@ repeat: | |||
| 1926 | nospares = 0; | 1911 | nospares = 0; |
| 1927 | 1912 | ||
| 1928 | sync_req = mddev->in_sync; | 1913 | sync_req = mddev->in_sync; |
| 1929 | mddev->utime = get_seconds(); | ||
| 1930 | 1914 | ||
| 1931 | /* If this is just a dirty<->clean transition, and the array is clean | 1915 | /* If this is just a dirty<->clean transition, and the array is clean |
| 1932 | * and 'events' is odd, we can roll back to the previous clean state */ | 1916 | * and 'events' is odd, we can roll back to the previous clean state */ |
| @@ -2597,15 +2581,6 @@ static void analyze_sbs(mddev_t * mddev) | |||
| 2597 | clear_bit(In_sync, &rdev->flags); | 2581 | clear_bit(In_sync, &rdev->flags); |
| 2598 | } | 2582 | } |
| 2599 | } | 2583 | } |
| 2600 | |||
| 2601 | |||
| 2602 | |||
| 2603 | if (mddev->recovery_cp != MaxSector && | ||
| 2604 | mddev->level >= 1) | ||
| 2605 | printk(KERN_ERR "md: %s: raid array is not clean" | ||
| 2606 | " -- starting background reconstruction\n", | ||
| 2607 | mdname(mddev)); | ||
| 2608 | |||
| 2609 | } | 2584 | } |
| 2610 | 2585 | ||
| 2611 | static void md_safemode_timeout(unsigned long data); | 2586 | static void md_safemode_timeout(unsigned long data); |
| @@ -2746,7 +2721,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 2746 | if (IS_ERR(priv)) { | 2721 | if (IS_ERR(priv)) { |
| 2747 | mddev->new_level = mddev->level; | 2722 | mddev->new_level = mddev->level; |
| 2748 | mddev->new_layout = mddev->layout; | 2723 | mddev->new_layout = mddev->layout; |
| 2749 | mddev->new_chunk = mddev->chunk_size; | 2724 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 2750 | mddev->raid_disks -= mddev->delta_disks; | 2725 | mddev->raid_disks -= mddev->delta_disks; |
| 2751 | mddev->delta_disks = 0; | 2726 | mddev->delta_disks = 0; |
| 2752 | module_put(pers->owner); | 2727 | module_put(pers->owner); |
| @@ -2764,7 +2739,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 2764 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); | 2739 | strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); |
| 2765 | mddev->level = mddev->new_level; | 2740 | mddev->level = mddev->new_level; |
| 2766 | mddev->layout = mddev->new_layout; | 2741 | mddev->layout = mddev->new_layout; |
| 2767 | mddev->chunk_size = mddev->new_chunk; | 2742 | mddev->chunk_sectors = mddev->new_chunk_sectors; |
| 2768 | mddev->delta_disks = 0; | 2743 | mddev->delta_disks = 0; |
| 2769 | pers->run(mddev); | 2744 | pers->run(mddev); |
| 2770 | mddev_resume(mddev); | 2745 | mddev_resume(mddev); |
| @@ -2800,11 +2775,14 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 2800 | 2775 | ||
| 2801 | if (mddev->pers) { | 2776 | if (mddev->pers) { |
| 2802 | int err; | 2777 | int err; |
| 2803 | if (mddev->pers->reconfig == NULL) | 2778 | if (mddev->pers->check_reshape == NULL) |
| 2804 | return -EBUSY; | 2779 | return -EBUSY; |
| 2805 | err = mddev->pers->reconfig(mddev, n, -1); | 2780 | mddev->new_layout = n; |
| 2806 | if (err) | 2781 | err = mddev->pers->check_reshape(mddev); |
| 2782 | if (err) { | ||
| 2783 | mddev->new_layout = mddev->layout; | ||
| 2807 | return err; | 2784 | return err; |
| 2785 | } | ||
| 2808 | } else { | 2786 | } else { |
| 2809 | mddev->new_layout = n; | 2787 | mddev->new_layout = n; |
| 2810 | if (mddev->reshape_position == MaxSector) | 2788 | if (mddev->reshape_position == MaxSector) |
| @@ -2857,10 +2835,11 @@ static ssize_t | |||
| 2857 | chunk_size_show(mddev_t *mddev, char *page) | 2835 | chunk_size_show(mddev_t *mddev, char *page) |
| 2858 | { | 2836 | { |
| 2859 | if (mddev->reshape_position != MaxSector && | 2837 | if (mddev->reshape_position != MaxSector && |
| 2860 | mddev->chunk_size != mddev->new_chunk) | 2838 | mddev->chunk_sectors != mddev->new_chunk_sectors) |
| 2861 | return sprintf(page, "%d (%d)\n", mddev->new_chunk, | 2839 | return sprintf(page, "%d (%d)\n", |
| 2862 | mddev->chunk_size); | 2840 | mddev->new_chunk_sectors << 9, |
| 2863 | return sprintf(page, "%d\n", mddev->chunk_size); | 2841 | mddev->chunk_sectors << 9); |
| 2842 | return sprintf(page, "%d\n", mddev->chunk_sectors << 9); | ||
| 2864 | } | 2843 | } |
| 2865 | 2844 | ||
| 2866 | static ssize_t | 2845 | static ssize_t |
| @@ -2874,15 +2853,18 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 2874 | 2853 | ||
| 2875 | if (mddev->pers) { | 2854 | if (mddev->pers) { |
| 2876 | int err; | 2855 | int err; |
| 2877 | if (mddev->pers->reconfig == NULL) | 2856 | if (mddev->pers->check_reshape == NULL) |
| 2878 | return -EBUSY; | 2857 | return -EBUSY; |
| 2879 | err = mddev->pers->reconfig(mddev, -1, n); | 2858 | mddev->new_chunk_sectors = n >> 9; |
| 2880 | if (err) | 2859 | err = mddev->pers->check_reshape(mddev); |
| 2860 | if (err) { | ||
| 2861 | mddev->new_chunk_sectors = mddev->chunk_sectors; | ||
| 2881 | return err; | 2862 | return err; |
| 2863 | } | ||
| 2882 | } else { | 2864 | } else { |
| 2883 | mddev->new_chunk = n; | 2865 | mddev->new_chunk_sectors = n >> 9; |
| 2884 | if (mddev->reshape_position == MaxSector) | 2866 | if (mddev->reshape_position == MaxSector) |
| 2885 | mddev->chunk_size = n; | 2867 | mddev->chunk_sectors = n >> 9; |
| 2886 | } | 2868 | } |
| 2887 | return len; | 2869 | return len; |
| 2888 | } | 2870 | } |
| @@ -3527,8 +3509,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3527 | return -EBUSY; | 3509 | return -EBUSY; |
| 3528 | 3510 | ||
| 3529 | /* Must be a multiple of chunk_size */ | 3511 | /* Must be a multiple of chunk_size */ |
| 3530 | if (mddev->chunk_size) { | 3512 | if (mddev->chunk_sectors) { |
| 3531 | if (min & (sector_t)((mddev->chunk_size>>9)-1)) | 3513 | sector_t temp = min; |
| 3514 | if (sector_div(temp, mddev->chunk_sectors)) | ||
| 3532 | return -EINVAL; | 3515 | return -EINVAL; |
| 3533 | } | 3516 | } |
| 3534 | mddev->resync_min = min; | 3517 | mddev->resync_min = min; |
| @@ -3564,8 +3547,9 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3564 | return -EBUSY; | 3547 | return -EBUSY; |
| 3565 | 3548 | ||
| 3566 | /* Must be a multiple of chunk_size */ | 3549 | /* Must be a multiple of chunk_size */ |
| 3567 | if (mddev->chunk_size) { | 3550 | if (mddev->chunk_sectors) { |
| 3568 | if (max & (sector_t)((mddev->chunk_size>>9)-1)) | 3551 | sector_t temp = max; |
| 3552 | if (sector_div(temp, mddev->chunk_sectors)) | ||
| 3569 | return -EINVAL; | 3553 | return -EINVAL; |
| 3570 | } | 3554 | } |
| 3571 | mddev->resync_max = max; | 3555 | mddev->resync_max = max; |
| @@ -3656,7 +3640,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len) | |||
| 3656 | mddev->delta_disks = 0; | 3640 | mddev->delta_disks = 0; |
| 3657 | mddev->new_level = mddev->level; | 3641 | mddev->new_level = mddev->level; |
| 3658 | mddev->new_layout = mddev->layout; | 3642 | mddev->new_layout = mddev->layout; |
| 3659 | mddev->new_chunk = mddev->chunk_size; | 3643 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 3660 | return len; | 3644 | return len; |
| 3661 | } | 3645 | } |
| 3662 | 3646 | ||
| @@ -3976,11 +3960,9 @@ static int start_dirty_degraded; | |||
| 3976 | static int do_md_run(mddev_t * mddev) | 3960 | static int do_md_run(mddev_t * mddev) |
| 3977 | { | 3961 | { |
| 3978 | int err; | 3962 | int err; |
| 3979 | int chunk_size; | ||
| 3980 | mdk_rdev_t *rdev; | 3963 | mdk_rdev_t *rdev; |
| 3981 | struct gendisk *disk; | 3964 | struct gendisk *disk; |
| 3982 | struct mdk_personality *pers; | 3965 | struct mdk_personality *pers; |
| 3983 | char b[BDEVNAME_SIZE]; | ||
| 3984 | 3966 | ||
| 3985 | if (list_empty(&mddev->disks)) | 3967 | if (list_empty(&mddev->disks)) |
| 3986 | /* cannot run an array with no devices.. */ | 3968 | /* cannot run an array with no devices.. */ |
| @@ -3998,38 +3980,6 @@ static int do_md_run(mddev_t * mddev) | |||
| 3998 | analyze_sbs(mddev); | 3980 | analyze_sbs(mddev); |
| 3999 | } | 3981 | } |
| 4000 | 3982 | ||
| 4001 | chunk_size = mddev->chunk_size; | ||
| 4002 | |||
| 4003 | if (chunk_size) { | ||
| 4004 | if (chunk_size > MAX_CHUNK_SIZE) { | ||
| 4005 | printk(KERN_ERR "too big chunk_size: %d > %d\n", | ||
| 4006 | chunk_size, MAX_CHUNK_SIZE); | ||
| 4007 | return -EINVAL; | ||
| 4008 | } | ||
| 4009 | /* | ||
| 4010 | * chunk-size has to be a power of 2 | ||
| 4011 | */ | ||
| 4012 | if ( (1 << ffz(~chunk_size)) != chunk_size) { | ||
| 4013 | printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); | ||
| 4014 | return -EINVAL; | ||
| 4015 | } | ||
| 4016 | |||
| 4017 | /* devices must have minimum size of one chunk */ | ||
| 4018 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
| 4019 | if (test_bit(Faulty, &rdev->flags)) | ||
| 4020 | continue; | ||
| 4021 | if (rdev->sectors < chunk_size / 512) { | ||
| 4022 | printk(KERN_WARNING | ||
| 4023 | "md: Dev %s smaller than chunk_size:" | ||
| 4024 | " %llu < %d\n", | ||
| 4025 | bdevname(rdev->bdev,b), | ||
| 4026 | (unsigned long long)rdev->sectors, | ||
| 4027 | chunk_size / 512); | ||
| 4028 | return -EINVAL; | ||
| 4029 | } | ||
| 4030 | } | ||
| 4031 | } | ||
| 4032 | |||
| 4033 | if (mddev->level != LEVEL_NONE) | 3983 | if (mddev->level != LEVEL_NONE) |
| 4034 | request_module("md-level-%d", mddev->level); | 3984 | request_module("md-level-%d", mddev->level); |
| 4035 | else if (mddev->clevel[0]) | 3985 | else if (mddev->clevel[0]) |
| @@ -4405,7 +4355,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
| 4405 | mddev->flags = 0; | 4355 | mddev->flags = 0; |
| 4406 | mddev->ro = 0; | 4356 | mddev->ro = 0; |
| 4407 | mddev->metadata_type[0] = 0; | 4357 | mddev->metadata_type[0] = 0; |
| 4408 | mddev->chunk_size = 0; | 4358 | mddev->chunk_sectors = 0; |
| 4409 | mddev->ctime = mddev->utime = 0; | 4359 | mddev->ctime = mddev->utime = 0; |
| 4410 | mddev->layout = 0; | 4360 | mddev->layout = 0; |
| 4411 | mddev->max_disks = 0; | 4361 | mddev->max_disks = 0; |
| @@ -4413,7 +4363,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
| 4413 | mddev->delta_disks = 0; | 4363 | mddev->delta_disks = 0; |
| 4414 | mddev->new_level = LEVEL_NONE; | 4364 | mddev->new_level = LEVEL_NONE; |
| 4415 | mddev->new_layout = 0; | 4365 | mddev->new_layout = 0; |
| 4416 | mddev->new_chunk = 0; | 4366 | mddev->new_chunk_sectors = 0; |
| 4417 | mddev->curr_resync = 0; | 4367 | mddev->curr_resync = 0; |
| 4418 | mddev->resync_mismatches = 0; | 4368 | mddev->resync_mismatches = 0; |
| 4419 | mddev->suspend_lo = mddev->suspend_hi = 0; | 4369 | mddev->suspend_lo = mddev->suspend_hi = 0; |
| @@ -4618,7 +4568,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) | |||
| 4618 | info.spare_disks = spare; | 4568 | info.spare_disks = spare; |
| 4619 | 4569 | ||
| 4620 | info.layout = mddev->layout; | 4570 | info.layout = mddev->layout; |
| 4621 | info.chunk_size = mddev->chunk_size; | 4571 | info.chunk_size = mddev->chunk_sectors << 9; |
| 4622 | 4572 | ||
| 4623 | if (copy_to_user(arg, &info, sizeof(info))) | 4573 | if (copy_to_user(arg, &info, sizeof(info))) |
| 4624 | return -EFAULT; | 4574 | return -EFAULT; |
| @@ -4843,7 +4793,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) | |||
| 4843 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 4793 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
| 4844 | } else | 4794 | } else |
| 4845 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); | 4795 | rdev->sb_start = calc_dev_sboffset(rdev->bdev); |
| 4846 | rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); | 4796 | rdev->sectors = rdev->sb_start; |
| 4847 | 4797 | ||
| 4848 | err = bind_rdev_to_array(rdev, mddev); | 4798 | err = bind_rdev_to_array(rdev, mddev); |
| 4849 | if (err) { | 4799 | if (err) { |
| @@ -4913,7 +4863,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) | |||
| 4913 | else | 4863 | else |
| 4914 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; | 4864 | rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; |
| 4915 | 4865 | ||
| 4916 | rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size); | 4866 | rdev->sectors = rdev->sb_start; |
| 4917 | 4867 | ||
| 4918 | if (test_bit(Faulty, &rdev->flags)) { | 4868 | if (test_bit(Faulty, &rdev->flags)) { |
| 4919 | printk(KERN_WARNING | 4869 | printk(KERN_WARNING |
| @@ -5062,7 +5012,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
| 5062 | mddev->external = 0; | 5012 | mddev->external = 0; |
| 5063 | 5013 | ||
| 5064 | mddev->layout = info->layout; | 5014 | mddev->layout = info->layout; |
| 5065 | mddev->chunk_size = info->chunk_size; | 5015 | mddev->chunk_sectors = info->chunk_size >> 9; |
| 5066 | 5016 | ||
| 5067 | mddev->max_disks = MD_SB_DISKS; | 5017 | mddev->max_disks = MD_SB_DISKS; |
| 5068 | 5018 | ||
| @@ -5081,7 +5031,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) | |||
| 5081 | get_random_bytes(mddev->uuid, 16); | 5031 | get_random_bytes(mddev->uuid, 16); |
| 5082 | 5032 | ||
| 5083 | mddev->new_level = mddev->level; | 5033 | mddev->new_level = mddev->level; |
| 5084 | mddev->new_chunk = mddev->chunk_size; | 5034 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 5085 | mddev->new_layout = mddev->layout; | 5035 | mddev->new_layout = mddev->layout; |
| 5086 | mddev->delta_disks = 0; | 5036 | mddev->delta_disks = 0; |
| 5087 | 5037 | ||
| @@ -5191,7 +5141,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
| 5191 | mddev->level != info->level || | 5141 | mddev->level != info->level || |
| 5192 | /* mddev->layout != info->layout || */ | 5142 | /* mddev->layout != info->layout || */ |
| 5193 | !mddev->persistent != info->not_persistent|| | 5143 | !mddev->persistent != info->not_persistent|| |
| 5194 | mddev->chunk_size != info->chunk_size || | 5144 | mddev->chunk_sectors != info->chunk_size >> 9 || |
| 5195 | /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ | 5145 | /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ |
| 5196 | ((state^info->state) & 0xfffffe00) | 5146 | ((state^info->state) & 0xfffffe00) |
| 5197 | ) | 5147 | ) |
| @@ -5215,10 +5165,15 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) | |||
| 5215 | * we don't need to do anything at the md level, the | 5165 | * we don't need to do anything at the md level, the |
| 5216 | * personality will take care of it all. | 5166 | * personality will take care of it all. |
| 5217 | */ | 5167 | */ |
| 5218 | if (mddev->pers->reconfig == NULL) | 5168 | if (mddev->pers->check_reshape == NULL) |
| 5219 | return -EINVAL; | 5169 | return -EINVAL; |
| 5220 | else | 5170 | else { |
| 5221 | return mddev->pers->reconfig(mddev, info->layout, -1); | 5171 | mddev->new_layout = info->layout; |
| 5172 | rv = mddev->pers->check_reshape(mddev); | ||
| 5173 | if (rv) | ||
| 5174 | mddev->new_layout = mddev->layout; | ||
| 5175 | return rv; | ||
| 5176 | } | ||
| 5222 | } | 5177 | } |
| 5223 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) | 5178 | if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) |
| 5224 | rv = update_size(mddev, (sector_t)info->size * 2); | 5179 | rv = update_size(mddev, (sector_t)info->size * 2); |
| @@ -6717,7 +6672,8 @@ void md_check_recovery(mddev_t *mddev) | |||
| 6717 | */ | 6672 | */ |
| 6718 | 6673 | ||
| 6719 | if (mddev->reshape_position != MaxSector) { | 6674 | if (mddev->reshape_position != MaxSector) { |
| 6720 | if (mddev->pers->check_reshape(mddev) != 0) | 6675 | if (mddev->pers->check_reshape == NULL || |
| 6676 | mddev->pers->check_reshape(mddev) != 0) | ||
| 6721 | /* Cannot proceed */ | 6677 | /* Cannot proceed */ |
| 6722 | goto unlock; | 6678 | goto unlock; |
| 6723 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); | 6679 | set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); |
diff --git a/drivers/md/md.h b/drivers/md/md.h index 8227ab909d44..9430a110db93 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
| @@ -30,13 +30,6 @@ typedef struct mddev_s mddev_t; | |||
| 30 | typedef struct mdk_rdev_s mdk_rdev_t; | 30 | typedef struct mdk_rdev_s mdk_rdev_t; |
| 31 | 31 | ||
| 32 | /* | 32 | /* |
| 33 | * options passed in raidrun: | ||
| 34 | */ | ||
| 35 | |||
| 36 | /* Currently this must fit in an 'int' */ | ||
| 37 | #define MAX_CHUNK_SIZE (1<<30) | ||
| 38 | |||
| 39 | /* | ||
| 40 | * MD's 'extended' device | 33 | * MD's 'extended' device |
| 41 | */ | 34 | */ |
| 42 | struct mdk_rdev_s | 35 | struct mdk_rdev_s |
| @@ -145,7 +138,7 @@ struct mddev_s | |||
| 145 | int external; /* metadata is | 138 | int external; /* metadata is |
| 146 | * managed externally */ | 139 | * managed externally */ |
| 147 | char metadata_type[17]; /* externally set*/ | 140 | char metadata_type[17]; /* externally set*/ |
| 148 | int chunk_size; | 141 | int chunk_sectors; |
| 149 | time_t ctime, utime; | 142 | time_t ctime, utime; |
| 150 | int level, layout; | 143 | int level, layout; |
| 151 | char clevel[16]; | 144 | char clevel[16]; |
| @@ -166,7 +159,8 @@ struct mddev_s | |||
| 166 | * If reshape_position is MaxSector, then no reshape is happening (yet). | 159 | * If reshape_position is MaxSector, then no reshape is happening (yet). |
| 167 | */ | 160 | */ |
| 168 | sector_t reshape_position; | 161 | sector_t reshape_position; |
| 169 | int delta_disks, new_level, new_layout, new_chunk; | 162 | int delta_disks, new_level, new_layout; |
| 163 | int new_chunk_sectors; | ||
| 170 | 164 | ||
| 171 | struct mdk_thread_s *thread; /* management thread */ | 165 | struct mdk_thread_s *thread; /* management thread */ |
| 172 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ | 166 | struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ |
| @@ -325,7 +319,6 @@ struct mdk_personality | |||
| 325 | int (*check_reshape) (mddev_t *mddev); | 319 | int (*check_reshape) (mddev_t *mddev); |
| 326 | int (*start_reshape) (mddev_t *mddev); | 320 | int (*start_reshape) (mddev_t *mddev); |
| 327 | void (*finish_reshape) (mddev_t *mddev); | 321 | void (*finish_reshape) (mddev_t *mddev); |
| 328 | int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); | ||
| 329 | /* quiesce moves between quiescence states | 322 | /* quiesce moves between quiescence states |
| 330 | * 0 - fully active | 323 | * 0 - fully active |
| 331 | * 1 - no new requests allowed | 324 | * 1 - no new requests allowed |
| @@ -437,5 +430,6 @@ extern void md_new_event(mddev_t *mddev); | |||
| 437 | extern int md_allow_write(mddev_t *mddev); | 430 | extern int md_allow_write(mddev_t *mddev); |
| 438 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); | 431 | extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); |
| 439 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); | 432 | extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); |
| 433 | extern int md_check_no_bitmap(mddev_t *mddev); | ||
| 440 | 434 | ||
| 441 | #endif /* _MD_MD_H */ | 435 | #endif /* _MD_MD_H */ |
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 41ced0cbe823..cbe368fa6598 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
| @@ -58,7 +58,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) | |||
| 58 | { | 58 | { |
| 59 | unsigned long flags; | 59 | unsigned long flags; |
| 60 | mddev_t *mddev = mp_bh->mddev; | 60 | mddev_t *mddev = mp_bh->mddev; |
| 61 | multipath_conf_t *conf = mddev_to_conf(mddev); | 61 | multipath_conf_t *conf = mddev->private; |
| 62 | 62 | ||
| 63 | spin_lock_irqsave(&conf->device_lock, flags); | 63 | spin_lock_irqsave(&conf->device_lock, flags); |
| 64 | list_add(&mp_bh->retry_list, &conf->retry_list); | 64 | list_add(&mp_bh->retry_list, &conf->retry_list); |
| @@ -75,7 +75,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) | |||
| 75 | static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) | 75 | static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) |
| 76 | { | 76 | { |
| 77 | struct bio *bio = mp_bh->master_bio; | 77 | struct bio *bio = mp_bh->master_bio; |
| 78 | multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); | 78 | multipath_conf_t *conf = mp_bh->mddev->private; |
| 79 | 79 | ||
| 80 | bio_endio(bio, err); | 80 | bio_endio(bio, err); |
| 81 | mempool_free(mp_bh, conf->pool); | 81 | mempool_free(mp_bh, conf->pool); |
| @@ -85,7 +85,7 @@ static void multipath_end_request(struct bio *bio, int error) | |||
| 85 | { | 85 | { |
| 86 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 86 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 87 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); | 87 | struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); |
| 88 | multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); | 88 | multipath_conf_t *conf = mp_bh->mddev->private; |
| 89 | mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; | 89 | mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; |
| 90 | 90 | ||
| 91 | if (uptodate) | 91 | if (uptodate) |
| @@ -107,7 +107,7 @@ static void multipath_end_request(struct bio *bio, int error) | |||
| 107 | 107 | ||
| 108 | static void unplug_slaves(mddev_t *mddev) | 108 | static void unplug_slaves(mddev_t *mddev) |
| 109 | { | 109 | { |
| 110 | multipath_conf_t *conf = mddev_to_conf(mddev); | 110 | multipath_conf_t *conf = mddev->private; |
| 111 | int i; | 111 | int i; |
| 112 | 112 | ||
| 113 | rcu_read_lock(); | 113 | rcu_read_lock(); |
| @@ -138,7 +138,7 @@ static void multipath_unplug(struct request_queue *q) | |||
| 138 | static int multipath_make_request (struct request_queue *q, struct bio * bio) | 138 | static int multipath_make_request (struct request_queue *q, struct bio * bio) |
| 139 | { | 139 | { |
| 140 | mddev_t *mddev = q->queuedata; | 140 | mddev_t *mddev = q->queuedata; |
| 141 | multipath_conf_t *conf = mddev_to_conf(mddev); | 141 | multipath_conf_t *conf = mddev->private; |
| 142 | struct multipath_bh * mp_bh; | 142 | struct multipath_bh * mp_bh; |
| 143 | struct multipath_info *multipath; | 143 | struct multipath_info *multipath; |
| 144 | const int rw = bio_data_dir(bio); | 144 | const int rw = bio_data_dir(bio); |
| @@ -180,7 +180,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) | |||
| 180 | 180 | ||
| 181 | static void multipath_status (struct seq_file *seq, mddev_t *mddev) | 181 | static void multipath_status (struct seq_file *seq, mddev_t *mddev) |
| 182 | { | 182 | { |
| 183 | multipath_conf_t *conf = mddev_to_conf(mddev); | 183 | multipath_conf_t *conf = mddev->private; |
| 184 | int i; | 184 | int i; |
| 185 | 185 | ||
| 186 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, | 186 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, |
| @@ -195,7 +195,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) | |||
| 195 | static int multipath_congested(void *data, int bits) | 195 | static int multipath_congested(void *data, int bits) |
| 196 | { | 196 | { |
| 197 | mddev_t *mddev = data; | 197 | mddev_t *mddev = data; |
| 198 | multipath_conf_t *conf = mddev_to_conf(mddev); | 198 | multipath_conf_t *conf = mddev->private; |
| 199 | int i, ret = 0; | 199 | int i, ret = 0; |
| 200 | 200 | ||
| 201 | rcu_read_lock(); | 201 | rcu_read_lock(); |
| @@ -220,7 +220,7 @@ static int multipath_congested(void *data, int bits) | |||
| 220 | */ | 220 | */ |
| 221 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) | 221 | static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) |
| 222 | { | 222 | { |
| 223 | multipath_conf_t *conf = mddev_to_conf(mddev); | 223 | multipath_conf_t *conf = mddev->private; |
| 224 | 224 | ||
| 225 | if (conf->working_disks <= 1) { | 225 | if (conf->working_disks <= 1) { |
| 226 | /* | 226 | /* |
| @@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 303 | * merge_bvec_fn will be involved in multipath.) | 303 | * merge_bvec_fn will be involved in multipath.) |
| 304 | */ | 304 | */ |
| 305 | if (q->merge_bvec_fn && | 305 | if (q->merge_bvec_fn && |
| 306 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 306 | queue_max_sectors(q) > (PAGE_SIZE>>9)) |
| 307 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 307 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 308 | 308 | ||
| 309 | conf->working_disks++; | 309 | conf->working_disks++; |
| @@ -367,7 +367,7 @@ static void multipathd (mddev_t *mddev) | |||
| 367 | struct multipath_bh *mp_bh; | 367 | struct multipath_bh *mp_bh; |
| 368 | struct bio *bio; | 368 | struct bio *bio; |
| 369 | unsigned long flags; | 369 | unsigned long flags; |
| 370 | multipath_conf_t *conf = mddev_to_conf(mddev); | 370 | multipath_conf_t *conf = mddev->private; |
| 371 | struct list_head *head = &conf->retry_list; | 371 | struct list_head *head = &conf->retry_list; |
| 372 | 372 | ||
| 373 | md_check_recovery(mddev); | 373 | md_check_recovery(mddev); |
| @@ -421,6 +421,9 @@ static int multipath_run (mddev_t *mddev) | |||
| 421 | struct multipath_info *disk; | 421 | struct multipath_info *disk; |
| 422 | mdk_rdev_t *rdev; | 422 | mdk_rdev_t *rdev; |
| 423 | 423 | ||
| 424 | if (md_check_no_bitmap(mddev)) | ||
| 425 | return -EINVAL; | ||
| 426 | |||
| 424 | if (mddev->level != LEVEL_MULTIPATH) { | 427 | if (mddev->level != LEVEL_MULTIPATH) { |
| 425 | printk("multipath: %s: raid level not set to multipath IO (%d)\n", | 428 | printk("multipath: %s: raid level not set to multipath IO (%d)\n", |
| 426 | mdname(mddev), mddev->level); | 429 | mdname(mddev), mddev->level); |
| @@ -467,7 +470,7 @@ static int multipath_run (mddev_t *mddev) | |||
| 467 | * violating it, not that we ever expect a device with | 470 | * violating it, not that we ever expect a device with |
| 468 | * a merge_bvec_fn to be involved in multipath */ | 471 | * a merge_bvec_fn to be involved in multipath */ |
| 469 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 472 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 470 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 473 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 471 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 474 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 472 | 475 | ||
| 473 | if (!test_bit(Faulty, &rdev->flags)) | 476 | if (!test_bit(Faulty, &rdev->flags)) |
| @@ -531,7 +534,7 @@ out: | |||
| 531 | 534 | ||
| 532 | static int multipath_stop (mddev_t *mddev) | 535 | static int multipath_stop (mddev_t *mddev) |
| 533 | { | 536 | { |
| 534 | multipath_conf_t *conf = mddev_to_conf(mddev); | 537 | multipath_conf_t *conf = mddev->private; |
| 535 | 538 | ||
| 536 | md_unregister_thread(mddev->thread); | 539 | md_unregister_thread(mddev->thread); |
| 537 | mddev->thread = NULL; | 540 | mddev->thread = NULL; |
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h index 6fa70b400cda..d1c2a8d78395 100644 --- a/drivers/md/multipath.h +++ b/drivers/md/multipath.h | |||
| @@ -19,12 +19,6 @@ struct multipath_private_data { | |||
| 19 | typedef struct multipath_private_data multipath_conf_t; | 19 | typedef struct multipath_private_data multipath_conf_t; |
| 20 | 20 | ||
| 21 | /* | 21 | /* |
| 22 | * this is the only point in the RAID code where we violate | ||
| 23 | * C type safety. mddev->private is an 'opaque' pointer. | ||
| 24 | */ | ||
| 25 | #define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private) | ||
| 26 | |||
| 27 | /* | ||
| 28 | * this is our 'private' 'collective' MULTIPATH buffer head. | 22 | * this is our 'private' 'collective' MULTIPATH buffer head. |
| 29 | * it contains information about what kind of IO operations were started | 23 | * it contains information about what kind of IO operations were started |
| 30 | * for this MULTIPATH operation, and about their status: | 24 | * for this MULTIPATH operation, and about their status: |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c08d7559be55..ab4a489d8695 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
| @@ -26,8 +26,8 @@ | |||
| 26 | static void raid0_unplug(struct request_queue *q) | 26 | static void raid0_unplug(struct request_queue *q) |
| 27 | { | 27 | { |
| 28 | mddev_t *mddev = q->queuedata; | 28 | mddev_t *mddev = q->queuedata; |
| 29 | raid0_conf_t *conf = mddev_to_conf(mddev); | 29 | raid0_conf_t *conf = mddev->private; |
| 30 | mdk_rdev_t **devlist = conf->strip_zone[0].dev; | 30 | mdk_rdev_t **devlist = conf->devlist; |
| 31 | int i; | 31 | int i; |
| 32 | 32 | ||
| 33 | for (i=0; i<mddev->raid_disks; i++) { | 33 | for (i=0; i<mddev->raid_disks; i++) { |
| @@ -40,8 +40,8 @@ static void raid0_unplug(struct request_queue *q) | |||
| 40 | static int raid0_congested(void *data, int bits) | 40 | static int raid0_congested(void *data, int bits) |
| 41 | { | 41 | { |
| 42 | mddev_t *mddev = data; | 42 | mddev_t *mddev = data; |
| 43 | raid0_conf_t *conf = mddev_to_conf(mddev); | 43 | raid0_conf_t *conf = mddev->private; |
| 44 | mdk_rdev_t **devlist = conf->strip_zone[0].dev; | 44 | mdk_rdev_t **devlist = conf->devlist; |
| 45 | int i, ret = 0; | 45 | int i, ret = 0; |
| 46 | 46 | ||
| 47 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { | 47 | for (i = 0; i < mddev->raid_disks && !ret ; i++) { |
| @@ -52,27 +52,60 @@ static int raid0_congested(void *data, int bits) | |||
| 52 | return ret; | 52 | return ret; |
| 53 | } | 53 | } |
| 54 | 54 | ||
| 55 | /* | ||
| 56 | * inform the user of the raid configuration | ||
| 57 | */ | ||
| 58 | static void dump_zones(mddev_t *mddev) | ||
| 59 | { | ||
| 60 | int j, k, h; | ||
| 61 | sector_t zone_size = 0; | ||
| 62 | sector_t zone_start = 0; | ||
| 63 | char b[BDEVNAME_SIZE]; | ||
| 64 | raid0_conf_t *conf = mddev->private; | ||
| 65 | printk(KERN_INFO "******* %s configuration *********\n", | ||
| 66 | mdname(mddev)); | ||
| 67 | h = 0; | ||
| 68 | for (j = 0; j < conf->nr_strip_zones; j++) { | ||
| 69 | printk(KERN_INFO "zone%d=[", j); | ||
| 70 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | ||
| 71 | printk("%s/", | ||
| 72 | bdevname(conf->devlist[j*mddev->raid_disks | ||
| 73 | + k]->bdev, b)); | ||
| 74 | printk("]\n"); | ||
| 75 | |||
| 76 | zone_size = conf->strip_zone[j].zone_end - zone_start; | ||
| 77 | printk(KERN_INFO " zone offset=%llukb " | ||
| 78 | "device offset=%llukb size=%llukb\n", | ||
| 79 | (unsigned long long)zone_start>>1, | ||
| 80 | (unsigned long long)conf->strip_zone[j].dev_start>>1, | ||
| 81 | (unsigned long long)zone_size>>1); | ||
| 82 | zone_start = conf->strip_zone[j].zone_end; | ||
| 83 | } | ||
| 84 | printk(KERN_INFO "**********************************\n\n"); | ||
| 85 | } | ||
| 55 | 86 | ||
| 56 | static int create_strip_zones (mddev_t *mddev) | 87 | static int create_strip_zones(mddev_t *mddev) |
| 57 | { | 88 | { |
| 58 | int i, c, j; | 89 | int i, c, j, err; |
| 59 | sector_t current_start, curr_zone_start; | 90 | sector_t curr_zone_end, sectors; |
| 60 | sector_t min_spacing; | 91 | mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev; |
| 61 | raid0_conf_t *conf = mddev_to_conf(mddev); | ||
| 62 | mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; | ||
| 63 | struct strip_zone *zone; | 92 | struct strip_zone *zone; |
| 64 | int cnt; | 93 | int cnt; |
| 65 | char b[BDEVNAME_SIZE]; | 94 | char b[BDEVNAME_SIZE]; |
| 66 | 95 | raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL); | |
| 67 | /* | 96 | |
| 68 | * The number of 'same size groups' | 97 | if (!conf) |
| 69 | */ | 98 | return -ENOMEM; |
| 70 | conf->nr_strip_zones = 0; | ||
| 71 | |||
| 72 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 99 | list_for_each_entry(rdev1, &mddev->disks, same_set) { |
| 73 | printk(KERN_INFO "raid0: looking at %s\n", | 100 | printk(KERN_INFO "raid0: looking at %s\n", |
| 74 | bdevname(rdev1->bdev,b)); | 101 | bdevname(rdev1->bdev,b)); |
| 75 | c = 0; | 102 | c = 0; |
| 103 | |||
| 104 | /* round size to chunk_size */ | ||
| 105 | sectors = rdev1->sectors; | ||
| 106 | sector_div(sectors, mddev->chunk_sectors); | ||
| 107 | rdev1->sectors = sectors * mddev->chunk_sectors; | ||
| 108 | |||
| 76 | list_for_each_entry(rdev2, &mddev->disks, same_set) { | 109 | list_for_each_entry(rdev2, &mddev->disks, same_set) { |
| 77 | printk(KERN_INFO "raid0: comparing %s(%llu)", | 110 | printk(KERN_INFO "raid0: comparing %s(%llu)", |
| 78 | bdevname(rdev1->bdev,b), | 111 | bdevname(rdev1->bdev,b), |
| @@ -103,16 +136,16 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 103 | } | 136 | } |
| 104 | } | 137 | } |
| 105 | printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); | 138 | printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); |
| 106 | 139 | err = -ENOMEM; | |
| 107 | conf->strip_zone = kzalloc(sizeof(struct strip_zone)* | 140 | conf->strip_zone = kzalloc(sizeof(struct strip_zone)* |
| 108 | conf->nr_strip_zones, GFP_KERNEL); | 141 | conf->nr_strip_zones, GFP_KERNEL); |
| 109 | if (!conf->strip_zone) | 142 | if (!conf->strip_zone) |
| 110 | return 1; | 143 | goto abort; |
| 111 | conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* | 144 | conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* |
| 112 | conf->nr_strip_zones*mddev->raid_disks, | 145 | conf->nr_strip_zones*mddev->raid_disks, |
| 113 | GFP_KERNEL); | 146 | GFP_KERNEL); |
| 114 | if (!conf->devlist) | 147 | if (!conf->devlist) |
| 115 | return 1; | 148 | goto abort; |
| 116 | 149 | ||
| 117 | /* The first zone must contain all devices, so here we check that | 150 | /* The first zone must contain all devices, so here we check that |
| 118 | * there is a proper alignment of slots to devices and find them all | 151 | * there is a proper alignment of slots to devices and find them all |
| @@ -120,7 +153,8 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 120 | zone = &conf->strip_zone[0]; | 153 | zone = &conf->strip_zone[0]; |
| 121 | cnt = 0; | 154 | cnt = 0; |
| 122 | smallest = NULL; | 155 | smallest = NULL; |
| 123 | zone->dev = conf->devlist; | 156 | dev = conf->devlist; |
| 157 | err = -EINVAL; | ||
| 124 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 158 | list_for_each_entry(rdev1, &mddev->disks, same_set) { |
| 125 | int j = rdev1->raid_disk; | 159 | int j = rdev1->raid_disk; |
| 126 | 160 | ||
| @@ -129,12 +163,12 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 129 | "aborting!\n", j); | 163 | "aborting!\n", j); |
| 130 | goto abort; | 164 | goto abort; |
| 131 | } | 165 | } |
| 132 | if (zone->dev[j]) { | 166 | if (dev[j]) { |
| 133 | printk(KERN_ERR "raid0: multiple devices for %d - " | 167 | printk(KERN_ERR "raid0: multiple devices for %d - " |
| 134 | "aborting!\n", j); | 168 | "aborting!\n", j); |
| 135 | goto abort; | 169 | goto abort; |
| 136 | } | 170 | } |
| 137 | zone->dev[j] = rdev1; | 171 | dev[j] = rdev1; |
| 138 | 172 | ||
| 139 | blk_queue_stack_limits(mddev->queue, | 173 | blk_queue_stack_limits(mddev->queue, |
| 140 | rdev1->bdev->bd_disk->queue); | 174 | rdev1->bdev->bd_disk->queue); |
| @@ -144,7 +178,7 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 144 | */ | 178 | */ |
| 145 | 179 | ||
| 146 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && | 180 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && |
| 147 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 181 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 148 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 182 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 149 | 183 | ||
| 150 | if (!smallest || (rdev1->sectors < smallest->sectors)) | 184 | if (!smallest || (rdev1->sectors < smallest->sectors)) |
| @@ -157,34 +191,32 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 157 | goto abort; | 191 | goto abort; |
| 158 | } | 192 | } |
| 159 | zone->nb_dev = cnt; | 193 | zone->nb_dev = cnt; |
| 160 | zone->sectors = smallest->sectors * cnt; | 194 | zone->zone_end = smallest->sectors * cnt; |
| 161 | zone->zone_start = 0; | ||
| 162 | 195 | ||
| 163 | current_start = smallest->sectors; | 196 | curr_zone_end = zone->zone_end; |
| 164 | curr_zone_start = zone->sectors; | ||
| 165 | 197 | ||
| 166 | /* now do the other zones */ | 198 | /* now do the other zones */ |
| 167 | for (i = 1; i < conf->nr_strip_zones; i++) | 199 | for (i = 1; i < conf->nr_strip_zones; i++) |
| 168 | { | 200 | { |
| 169 | zone = conf->strip_zone + i; | 201 | zone = conf->strip_zone + i; |
| 170 | zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; | 202 | dev = conf->devlist + i * mddev->raid_disks; |
| 171 | 203 | ||
| 172 | printk(KERN_INFO "raid0: zone %d\n", i); | 204 | printk(KERN_INFO "raid0: zone %d\n", i); |
| 173 | zone->dev_start = current_start; | 205 | zone->dev_start = smallest->sectors; |
| 174 | smallest = NULL; | 206 | smallest = NULL; |
| 175 | c = 0; | 207 | c = 0; |
| 176 | 208 | ||
| 177 | for (j=0; j<cnt; j++) { | 209 | for (j=0; j<cnt; j++) { |
| 178 | char b[BDEVNAME_SIZE]; | 210 | char b[BDEVNAME_SIZE]; |
| 179 | rdev = conf->strip_zone[0].dev[j]; | 211 | rdev = conf->devlist[j]; |
| 180 | printk(KERN_INFO "raid0: checking %s ...", | 212 | printk(KERN_INFO "raid0: checking %s ...", |
| 181 | bdevname(rdev->bdev, b)); | 213 | bdevname(rdev->bdev, b)); |
| 182 | if (rdev->sectors <= current_start) { | 214 | if (rdev->sectors <= zone->dev_start) { |
| 183 | printk(KERN_INFO " nope.\n"); | 215 | printk(KERN_INFO " nope.\n"); |
| 184 | continue; | 216 | continue; |
| 185 | } | 217 | } |
| 186 | printk(KERN_INFO " contained as device %d\n", c); | 218 | printk(KERN_INFO " contained as device %d\n", c); |
| 187 | zone->dev[c] = rdev; | 219 | dev[c] = rdev; |
| 188 | c++; | 220 | c++; |
| 189 | if (!smallest || rdev->sectors < smallest->sectors) { | 221 | if (!smallest || rdev->sectors < smallest->sectors) { |
| 190 | smallest = rdev; | 222 | smallest = rdev; |
| @@ -194,47 +226,39 @@ static int create_strip_zones (mddev_t *mddev) | |||
| 194 | } | 226 | } |
| 195 | 227 | ||
| 196 | zone->nb_dev = c; | 228 | zone->nb_dev = c; |
| 197 | zone->sectors = (smallest->sectors - current_start) * c; | 229 | sectors = (smallest->sectors - zone->dev_start) * c; |
| 198 | printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", | 230 | printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", |
| 199 | zone->nb_dev, (unsigned long long)zone->sectors); | 231 | zone->nb_dev, (unsigned long long)sectors); |
| 200 | 232 | ||
| 201 | zone->zone_start = curr_zone_start; | 233 | curr_zone_end += sectors; |
| 202 | curr_zone_start += zone->sectors; | 234 | zone->zone_end = curr_zone_end; |
| 203 | 235 | ||
| 204 | current_start = smallest->sectors; | ||
| 205 | printk(KERN_INFO "raid0: current zone start: %llu\n", | 236 | printk(KERN_INFO "raid0: current zone start: %llu\n", |
| 206 | (unsigned long long)current_start); | 237 | (unsigned long long)smallest->sectors); |
| 207 | } | ||
| 208 | |||
| 209 | /* Now find appropriate hash spacing. | ||
| 210 | * We want a number which causes most hash entries to cover | ||
| 211 | * at most two strips, but the hash table must be at most | ||
| 212 | * 1 PAGE. We choose the smallest strip, or contiguous collection | ||
| 213 | * of strips, that has big enough size. We never consider the last | ||
| 214 | * strip though as it's size has no bearing on the efficacy of the hash | ||
| 215 | * table. | ||
| 216 | */ | ||
| 217 | conf->spacing = curr_zone_start; | ||
| 218 | min_spacing = curr_zone_start; | ||
| 219 | sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); | ||
| 220 | for (i=0; i < conf->nr_strip_zones-1; i++) { | ||
| 221 | sector_t s = 0; | ||
| 222 | for (j = i; j < conf->nr_strip_zones - 1 && | ||
| 223 | s < min_spacing; j++) | ||
| 224 | s += conf->strip_zone[j].sectors; | ||
| 225 | if (s >= min_spacing && s < conf->spacing) | ||
| 226 | conf->spacing = s; | ||
| 227 | } | 238 | } |
| 228 | |||
| 229 | mddev->queue->unplug_fn = raid0_unplug; | 239 | mddev->queue->unplug_fn = raid0_unplug; |
| 230 | |||
| 231 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; | 240 | mddev->queue->backing_dev_info.congested_fn = raid0_congested; |
| 232 | mddev->queue->backing_dev_info.congested_data = mddev; | 241 | mddev->queue->backing_dev_info.congested_data = mddev; |
| 233 | 242 | ||
| 243 | /* | ||
| 244 | * now since we have the hard sector sizes, we can make sure | ||
| 245 | * chunk size is a multiple of that sector size | ||
| 246 | */ | ||
| 247 | if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { | ||
| 248 | printk(KERN_ERR "%s chunk_size of %d not valid\n", | ||
| 249 | mdname(mddev), | ||
| 250 | mddev->chunk_sectors << 9); | ||
| 251 | goto abort; | ||
| 252 | } | ||
| 234 | printk(KERN_INFO "raid0: done.\n"); | 253 | printk(KERN_INFO "raid0: done.\n"); |
| 254 | mddev->private = conf; | ||
| 235 | return 0; | 255 | return 0; |
| 236 | abort: | 256 | abort: |
| 237 | return 1; | 257 | kfree(conf->strip_zone); |
| 258 | kfree(conf->devlist); | ||
| 259 | kfree(conf); | ||
| 260 | mddev->private = NULL; | ||
| 261 | return err; | ||
| 238 | } | 262 | } |
| 239 | 263 | ||
| 240 | /** | 264 | /** |
| @@ -252,10 +276,15 @@ static int raid0_mergeable_bvec(struct request_queue *q, | |||
| 252 | mddev_t *mddev = q->queuedata; | 276 | mddev_t *mddev = q->queuedata; |
| 253 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 277 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
| 254 | int max; | 278 | int max; |
| 255 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 279 | unsigned int chunk_sectors = mddev->chunk_sectors; |
| 256 | unsigned int bio_sectors = bvm->bi_size >> 9; | 280 | unsigned int bio_sectors = bvm->bi_size >> 9; |
| 257 | 281 | ||
| 258 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 282 | if (is_power_of_2(chunk_sectors)) |
| 283 | max = (chunk_sectors - ((sector & (chunk_sectors-1)) | ||
| 284 | + bio_sectors)) << 9; | ||
| 285 | else | ||
| 286 | max = (chunk_sectors - (sector_div(sector, chunk_sectors) | ||
| 287 | + bio_sectors)) << 9; | ||
| 259 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ | 288 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ |
| 260 | if (max <= biovec->bv_len && bio_sectors == 0) | 289 | if (max <= biovec->bv_len && bio_sectors == 0) |
| 261 | return biovec->bv_len; | 290 | return biovec->bv_len; |
| @@ -277,84 +306,28 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
| 277 | return array_sectors; | 306 | return array_sectors; |
| 278 | } | 307 | } |
| 279 | 308 | ||
| 280 | static int raid0_run (mddev_t *mddev) | 309 | static int raid0_run(mddev_t *mddev) |
| 281 | { | 310 | { |
| 282 | unsigned cur=0, i=0, nb_zone; | 311 | int ret; |
| 283 | s64 sectors; | ||
| 284 | raid0_conf_t *conf; | ||
| 285 | 312 | ||
| 286 | if (mddev->chunk_size == 0) { | 313 | if (mddev->chunk_sectors == 0) { |
| 287 | printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); | 314 | printk(KERN_ERR "md/raid0: chunk size must be set.\n"); |
| 288 | return -EINVAL; | 315 | return -EINVAL; |
| 289 | } | 316 | } |
| 290 | printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n", | 317 | if (md_check_no_bitmap(mddev)) |
| 291 | mdname(mddev), | 318 | return -EINVAL; |
| 292 | mddev->chunk_size >> 9, | 319 | blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors); |
| 293 | (mddev->chunk_size>>1)-1); | ||
| 294 | blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); | ||
| 295 | blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1); | ||
| 296 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; | 320 | mddev->queue->queue_lock = &mddev->queue->__queue_lock; |
| 297 | 321 | ||
| 298 | conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); | 322 | ret = create_strip_zones(mddev); |
| 299 | if (!conf) | 323 | if (ret < 0) |
| 300 | goto out; | 324 | return ret; |
| 301 | mddev->private = (void *)conf; | ||
| 302 | |||
| 303 | conf->strip_zone = NULL; | ||
| 304 | conf->devlist = NULL; | ||
| 305 | if (create_strip_zones (mddev)) | ||
| 306 | goto out_free_conf; | ||
| 307 | 325 | ||
| 308 | /* calculate array device size */ | 326 | /* calculate array device size */ |
| 309 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); | 327 | md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); |
| 310 | 328 | ||
| 311 | printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", | 329 | printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", |
| 312 | (unsigned long long)mddev->array_sectors); | 330 | (unsigned long long)mddev->array_sectors); |
| 313 | printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", | ||
| 314 | (unsigned long long)conf->spacing); | ||
| 315 | { | ||
| 316 | sector_t s = raid0_size(mddev, 0, 0); | ||
| 317 | sector_t space = conf->spacing; | ||
| 318 | int round; | ||
| 319 | conf->sector_shift = 0; | ||
| 320 | if (sizeof(sector_t) > sizeof(u32)) { | ||
| 321 | /*shift down space and s so that sector_div will work */ | ||
| 322 | while (space > (sector_t) (~(u32)0)) { | ||
| 323 | s >>= 1; | ||
| 324 | space >>= 1; | ||
| 325 | s += 1; /* force round-up */ | ||
| 326 | conf->sector_shift++; | ||
| 327 | } | ||
| 328 | } | ||
| 329 | round = sector_div(s, (u32)space) ? 1 : 0; | ||
| 330 | nb_zone = s + round; | ||
| 331 | } | ||
| 332 | printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone); | ||
| 333 | |||
| 334 | printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n", | ||
| 335 | nb_zone*sizeof(struct strip_zone*)); | ||
| 336 | conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); | ||
| 337 | if (!conf->hash_table) | ||
| 338 | goto out_free_conf; | ||
| 339 | sectors = conf->strip_zone[cur].sectors; | ||
| 340 | |||
| 341 | conf->hash_table[0] = conf->strip_zone + cur; | ||
| 342 | for (i=1; i< nb_zone; i++) { | ||
| 343 | while (sectors <= conf->spacing) { | ||
| 344 | cur++; | ||
| 345 | sectors += conf->strip_zone[cur].sectors; | ||
| 346 | } | ||
| 347 | sectors -= conf->spacing; | ||
| 348 | conf->hash_table[i] = conf->strip_zone + cur; | ||
| 349 | } | ||
| 350 | if (conf->sector_shift) { | ||
| 351 | conf->spacing >>= conf->sector_shift; | ||
| 352 | /* round spacing up so when we divide by it, we | ||
| 353 | * err on the side of too-low, which is safest | ||
| 354 | */ | ||
| 355 | conf->spacing++; | ||
| 356 | } | ||
| 357 | |||
| 358 | /* calculate the max read-ahead size. | 331 | /* calculate the max read-ahead size. |
| 359 | * For read-ahead of large files to be effective, we need to | 332 | * For read-ahead of large files to be effective, we need to |
| 360 | * readahead at least twice a whole stripe. i.e. number of devices | 333 | * readahead at least twice a whole stripe. i.e. number of devices |
| @@ -365,48 +338,107 @@ static int raid0_run (mddev_t *mddev) | |||
| 365 | * chunksize should be used in that case. | 338 | * chunksize should be used in that case. |
| 366 | */ | 339 | */ |
| 367 | { | 340 | { |
| 368 | int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE; | 341 | int stripe = mddev->raid_disks * |
| 342 | (mddev->chunk_sectors << 9) / PAGE_SIZE; | ||
| 369 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | 343 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) |
| 370 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 344 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; |
| 371 | } | 345 | } |
| 372 | 346 | ||
| 373 | |||
| 374 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); | 347 | blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); |
| 348 | dump_zones(mddev); | ||
| 375 | return 0; | 349 | return 0; |
| 350 | } | ||
| 376 | 351 | ||
| 377 | out_free_conf: | 352 | static int raid0_stop(mddev_t *mddev) |
| 353 | { | ||
| 354 | raid0_conf_t *conf = mddev->private; | ||
| 355 | |||
| 356 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
| 378 | kfree(conf->strip_zone); | 357 | kfree(conf->strip_zone); |
| 379 | kfree(conf->devlist); | 358 | kfree(conf->devlist); |
| 380 | kfree(conf); | 359 | kfree(conf); |
| 381 | mddev->private = NULL; | 360 | mddev->private = NULL; |
| 382 | out: | 361 | return 0; |
| 383 | return -ENOMEM; | ||
| 384 | } | 362 | } |
| 385 | 363 | ||
| 386 | static int raid0_stop (mddev_t *mddev) | 364 | /* Find the zone which holds a particular offset |
| 365 | * Update *sectorp to be an offset in that zone | ||
| 366 | */ | ||
| 367 | static struct strip_zone *find_zone(struct raid0_private_data *conf, | ||
| 368 | sector_t *sectorp) | ||
| 387 | { | 369 | { |
| 388 | raid0_conf_t *conf = mddev_to_conf(mddev); | 370 | int i; |
| 371 | struct strip_zone *z = conf->strip_zone; | ||
| 372 | sector_t sector = *sectorp; | ||
| 373 | |||
| 374 | for (i = 0; i < conf->nr_strip_zones; i++) | ||
| 375 | if (sector < z[i].zone_end) { | ||
| 376 | if (i) | ||
| 377 | *sectorp = sector - z[i-1].zone_end; | ||
| 378 | return z + i; | ||
| 379 | } | ||
| 380 | BUG(); | ||
| 381 | } | ||
| 389 | 382 | ||
| 390 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 383 | /* |
| 391 | kfree(conf->hash_table); | 384 | * remaps the bio to the target device. we separate two flows. |
| 392 | conf->hash_table = NULL; | 385 | * power 2 flow and a general flow for the sake of perfromance |
| 393 | kfree(conf->strip_zone); | 386 | */ |
| 394 | conf->strip_zone = NULL; | 387 | static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone, |
| 395 | kfree(conf); | 388 | sector_t sector, sector_t *sector_offset) |
| 396 | mddev->private = NULL; | 389 | { |
| 390 | unsigned int sect_in_chunk; | ||
| 391 | sector_t chunk; | ||
| 392 | raid0_conf_t *conf = mddev->private; | ||
| 393 | unsigned int chunk_sects = mddev->chunk_sectors; | ||
| 394 | |||
| 395 | if (is_power_of_2(chunk_sects)) { | ||
| 396 | int chunksect_bits = ffz(~chunk_sects); | ||
| 397 | /* find the sector offset inside the chunk */ | ||
| 398 | sect_in_chunk = sector & (chunk_sects - 1); | ||
| 399 | sector >>= chunksect_bits; | ||
| 400 | /* chunk in zone */ | ||
| 401 | chunk = *sector_offset; | ||
| 402 | /* quotient is the chunk in real device*/ | ||
| 403 | sector_div(chunk, zone->nb_dev << chunksect_bits); | ||
| 404 | } else{ | ||
| 405 | sect_in_chunk = sector_div(sector, chunk_sects); | ||
| 406 | chunk = *sector_offset; | ||
| 407 | sector_div(chunk, chunk_sects * zone->nb_dev); | ||
| 408 | } | ||
| 409 | /* | ||
| 410 | * position the bio over the real device | ||
| 411 | * real sector = chunk in device + starting of zone | ||
| 412 | * + the position in the chunk | ||
| 413 | */ | ||
| 414 | *sector_offset = (chunk * chunk_sects) + sect_in_chunk; | ||
| 415 | return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks | ||
| 416 | + sector_div(sector, zone->nb_dev)]; | ||
| 417 | } | ||
| 397 | 418 | ||
| 398 | return 0; | 419 | /* |
| 420 | * Is io distribute over 1 or more chunks ? | ||
| 421 | */ | ||
| 422 | static inline int is_io_in_chunk_boundary(mddev_t *mddev, | ||
| 423 | unsigned int chunk_sects, struct bio *bio) | ||
| 424 | { | ||
| 425 | if (likely(is_power_of_2(chunk_sects))) { | ||
| 426 | return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) | ||
| 427 | + (bio->bi_size >> 9)); | ||
| 428 | } else{ | ||
| 429 | sector_t sector = bio->bi_sector; | ||
| 430 | return chunk_sects >= (sector_div(sector, chunk_sects) | ||
| 431 | + (bio->bi_size >> 9)); | ||
| 432 | } | ||
| 399 | } | 433 | } |
| 400 | 434 | ||
| 401 | static int raid0_make_request (struct request_queue *q, struct bio *bio) | 435 | static int raid0_make_request(struct request_queue *q, struct bio *bio) |
| 402 | { | 436 | { |
| 403 | mddev_t *mddev = q->queuedata; | 437 | mddev_t *mddev = q->queuedata; |
| 404 | unsigned int sect_in_chunk, chunksect_bits, chunk_sects; | 438 | unsigned int chunk_sects; |
| 405 | raid0_conf_t *conf = mddev_to_conf(mddev); | 439 | sector_t sector_offset; |
| 406 | struct strip_zone *zone; | 440 | struct strip_zone *zone; |
| 407 | mdk_rdev_t *tmp_dev; | 441 | mdk_rdev_t *tmp_dev; |
| 408 | sector_t chunk; | ||
| 409 | sector_t sector, rsect; | ||
| 410 | const int rw = bio_data_dir(bio); | 442 | const int rw = bio_data_dir(bio); |
| 411 | int cpu; | 443 | int cpu; |
| 412 | 444 | ||
| @@ -421,11 +453,9 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) | |||
| 421 | bio_sectors(bio)); | 453 | bio_sectors(bio)); |
| 422 | part_stat_unlock(); | 454 | part_stat_unlock(); |
| 423 | 455 | ||
| 424 | chunk_sects = mddev->chunk_size >> 9; | 456 | chunk_sects = mddev->chunk_sectors; |
| 425 | chunksect_bits = ffz(~chunk_sects); | 457 | if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { |
| 426 | sector = bio->bi_sector; | 458 | sector_t sector = bio->bi_sector; |
| 427 | |||
| 428 | if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { | ||
| 429 | struct bio_pair *bp; | 459 | struct bio_pair *bp; |
| 430 | /* Sanity check -- queue functions should prevent this happening */ | 460 | /* Sanity check -- queue functions should prevent this happening */ |
| 431 | if (bio->bi_vcnt != 1 || | 461 | if (bio->bi_vcnt != 1 || |
| @@ -434,7 +464,12 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) | |||
| 434 | /* This is a one page bio that upper layers | 464 | /* This is a one page bio that upper layers |
| 435 | * refuse to split for us, so we need to split it. | 465 | * refuse to split for us, so we need to split it. |
| 436 | */ | 466 | */ |
| 437 | bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1))); | 467 | if (likely(is_power_of_2(chunk_sects))) |
| 468 | bp = bio_split(bio, chunk_sects - (sector & | ||
| 469 | (chunk_sects-1))); | ||
| 470 | else | ||
| 471 | bp = bio_split(bio, chunk_sects - | ||
| 472 | sector_div(sector, chunk_sects)); | ||
| 438 | if (raid0_make_request(q, &bp->bio1)) | 473 | if (raid0_make_request(q, &bp->bio1)) |
| 439 | generic_make_request(&bp->bio1); | 474 | generic_make_request(&bp->bio1); |
| 440 | if (raid0_make_request(q, &bp->bio2)) | 475 | if (raid0_make_request(q, &bp->bio2)) |
| @@ -443,34 +478,14 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio) | |||
| 443 | bio_pair_release(bp); | 478 | bio_pair_release(bp); |
| 444 | return 0; | 479 | return 0; |
| 445 | } | 480 | } |
| 446 | |||
| 447 | |||
| 448 | { | ||
| 449 | sector_t x = sector >> conf->sector_shift; | ||
| 450 | sector_div(x, (u32)conf->spacing); | ||
| 451 | zone = conf->hash_table[x]; | ||
| 452 | } | ||
| 453 | 481 | ||
| 454 | while (sector >= zone->zone_start + zone->sectors) | 482 | sector_offset = bio->bi_sector; |
| 455 | zone++; | 483 | zone = find_zone(mddev->private, §or_offset); |
| 456 | 484 | tmp_dev = map_sector(mddev, zone, bio->bi_sector, | |
| 457 | sect_in_chunk = bio->bi_sector & (chunk_sects - 1); | 485 | §or_offset); |
| 458 | |||
| 459 | |||
| 460 | { | ||
| 461 | sector_t x = (sector - zone->zone_start) >> chunksect_bits; | ||
| 462 | |||
| 463 | sector_div(x, zone->nb_dev); | ||
| 464 | chunk = x; | ||
| 465 | |||
| 466 | x = sector >> chunksect_bits; | ||
| 467 | tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; | ||
| 468 | } | ||
| 469 | rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk; | ||
| 470 | |||
| 471 | bio->bi_bdev = tmp_dev->bdev; | 486 | bio->bi_bdev = tmp_dev->bdev; |
| 472 | bio->bi_sector = rsect + tmp_dev->data_offset; | 487 | bio->bi_sector = sector_offset + zone->dev_start + |
| 473 | 488 | tmp_dev->data_offset; | |
| 474 | /* | 489 | /* |
| 475 | * Let the main block layer submit the IO and resolve recursion: | 490 | * Let the main block layer submit the IO and resolve recursion: |
| 476 | */ | 491 | */ |
| @@ -485,31 +500,35 @@ bad_map: | |||
| 485 | return 0; | 500 | return 0; |
| 486 | } | 501 | } |
| 487 | 502 | ||
| 488 | static void raid0_status (struct seq_file *seq, mddev_t *mddev) | 503 | static void raid0_status(struct seq_file *seq, mddev_t *mddev) |
| 489 | { | 504 | { |
| 490 | #undef MD_DEBUG | 505 | #undef MD_DEBUG |
| 491 | #ifdef MD_DEBUG | 506 | #ifdef MD_DEBUG |
| 492 | int j, k, h; | 507 | int j, k, h; |
| 493 | char b[BDEVNAME_SIZE]; | 508 | char b[BDEVNAME_SIZE]; |
| 494 | raid0_conf_t *conf = mddev_to_conf(mddev); | 509 | raid0_conf_t *conf = mddev->private; |
| 495 | 510 | ||
| 511 | sector_t zone_size; | ||
| 512 | sector_t zone_start = 0; | ||
| 496 | h = 0; | 513 | h = 0; |
| 514 | |||
| 497 | for (j = 0; j < conf->nr_strip_zones; j++) { | 515 | for (j = 0; j < conf->nr_strip_zones; j++) { |
| 498 | seq_printf(seq, " z%d", j); | 516 | seq_printf(seq, " z%d", j); |
| 499 | if (conf->hash_table[h] == conf->strip_zone+j) | ||
| 500 | seq_printf(seq, "(h%d)", h++); | ||
| 501 | seq_printf(seq, "=["); | 517 | seq_printf(seq, "=["); |
| 502 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) | 518 | for (k = 0; k < conf->strip_zone[j].nb_dev; k++) |
| 503 | seq_printf(seq, "%s/", bdevname( | 519 | seq_printf(seq, "%s/", bdevname( |
| 504 | conf->strip_zone[j].dev[k]->bdev,b)); | 520 | conf->devlist[j*mddev->raid_disks + k] |
| 505 | 521 | ->bdev, b)); | |
| 506 | seq_printf(seq, "] zs=%d ds=%d s=%d\n", | 522 | |
| 507 | conf->strip_zone[j].zone_start, | 523 | zone_size = conf->strip_zone[j].zone_end - zone_start; |
| 508 | conf->strip_zone[j].dev_start, | 524 | seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n", |
| 509 | conf->strip_zone[j].sectors); | 525 | (unsigned long long)zone_start>>1, |
| 526 | (unsigned long long)conf->strip_zone[j].dev_start>>1, | ||
| 527 | (unsigned long long)zone_size>>1); | ||
| 528 | zone_start = conf->strip_zone[j].zone_end; | ||
| 510 | } | 529 | } |
| 511 | #endif | 530 | #endif |
| 512 | seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); | 531 | seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); |
| 513 | return; | 532 | return; |
| 514 | } | 533 | } |
| 515 | 534 | ||
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 824b12eb1d4f..91f8e876ee64 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h | |||
| @@ -3,26 +3,18 @@ | |||
| 3 | 3 | ||
| 4 | struct strip_zone | 4 | struct strip_zone |
| 5 | { | 5 | { |
| 6 | sector_t zone_start; /* Zone offset in md_dev (in sectors) */ | 6 | sector_t zone_end; /* Start of the next zone (in sectors) */ |
| 7 | sector_t dev_start; /* Zone offset in real dev (in sectors) */ | 7 | sector_t dev_start; /* Zone offset in real dev (in sectors) */ |
| 8 | sector_t sectors; /* Zone size in sectors */ | ||
| 9 | int nb_dev; /* # of devices attached to the zone */ | 8 | int nb_dev; /* # of devices attached to the zone */ |
| 10 | mdk_rdev_t **dev; /* Devices attached to the zone */ | ||
| 11 | }; | 9 | }; |
| 12 | 10 | ||
| 13 | struct raid0_private_data | 11 | struct raid0_private_data |
| 14 | { | 12 | { |
| 15 | struct strip_zone **hash_table; /* Table of indexes into strip_zone */ | ||
| 16 | struct strip_zone *strip_zone; | 13 | struct strip_zone *strip_zone; |
| 17 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ | 14 | mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ |
| 18 | int nr_strip_zones; | 15 | int nr_strip_zones; |
| 19 | |||
| 20 | sector_t spacing; | ||
| 21 | int sector_shift; /* shift this before divide by spacing */ | ||
| 22 | }; | 16 | }; |
| 23 | 17 | ||
| 24 | typedef struct raid0_private_data raid0_conf_t; | 18 | typedef struct raid0_private_data raid0_conf_t; |
| 25 | 19 | ||
| 26 | #define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) | ||
| 27 | |||
| 28 | #endif | 20 | #endif |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 36df9109cde1..89939a7aef57 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
| @@ -182,7 +182,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) | |||
| 182 | 182 | ||
| 183 | static void free_r1bio(r1bio_t *r1_bio) | 183 | static void free_r1bio(r1bio_t *r1_bio) |
| 184 | { | 184 | { |
| 185 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 185 | conf_t *conf = r1_bio->mddev->private; |
| 186 | 186 | ||
| 187 | /* | 187 | /* |
| 188 | * Wake up any possible resync thread that waits for the device | 188 | * Wake up any possible resync thread that waits for the device |
| @@ -196,7 +196,7 @@ static void free_r1bio(r1bio_t *r1_bio) | |||
| 196 | 196 | ||
| 197 | static void put_buf(r1bio_t *r1_bio) | 197 | static void put_buf(r1bio_t *r1_bio) |
| 198 | { | 198 | { |
| 199 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 199 | conf_t *conf = r1_bio->mddev->private; |
| 200 | int i; | 200 | int i; |
| 201 | 201 | ||
| 202 | for (i=0; i<conf->raid_disks; i++) { | 202 | for (i=0; i<conf->raid_disks; i++) { |
| @@ -214,7 +214,7 @@ static void reschedule_retry(r1bio_t *r1_bio) | |||
| 214 | { | 214 | { |
| 215 | unsigned long flags; | 215 | unsigned long flags; |
| 216 | mddev_t *mddev = r1_bio->mddev; | 216 | mddev_t *mddev = r1_bio->mddev; |
| 217 | conf_t *conf = mddev_to_conf(mddev); | 217 | conf_t *conf = mddev->private; |
| 218 | 218 | ||
| 219 | spin_lock_irqsave(&conf->device_lock, flags); | 219 | spin_lock_irqsave(&conf->device_lock, flags); |
| 220 | list_add(&r1_bio->retry_list, &conf->retry_list); | 220 | list_add(&r1_bio->retry_list, &conf->retry_list); |
| @@ -253,7 +253,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
| 253 | */ | 253 | */ |
| 254 | static inline void update_head_pos(int disk, r1bio_t *r1_bio) | 254 | static inline void update_head_pos(int disk, r1bio_t *r1_bio) |
| 255 | { | 255 | { |
| 256 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 256 | conf_t *conf = r1_bio->mddev->private; |
| 257 | 257 | ||
| 258 | conf->mirrors[disk].head_position = | 258 | conf->mirrors[disk].head_position = |
| 259 | r1_bio->sector + (r1_bio->sectors); | 259 | r1_bio->sector + (r1_bio->sectors); |
| @@ -264,7 +264,7 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
| 264 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 264 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 265 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 265 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
| 266 | int mirror; | 266 | int mirror; |
| 267 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 267 | conf_t *conf = r1_bio->mddev->private; |
| 268 | 268 | ||
| 269 | mirror = r1_bio->read_disk; | 269 | mirror = r1_bio->read_disk; |
| 270 | /* | 270 | /* |
| @@ -309,7 +309,7 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
| 309 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 309 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 310 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 310 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
| 311 | int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); | 311 | int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); |
| 312 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 312 | conf_t *conf = r1_bio->mddev->private; |
| 313 | struct bio *to_put = NULL; | 313 | struct bio *to_put = NULL; |
| 314 | 314 | ||
| 315 | 315 | ||
| @@ -541,7 +541,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
| 541 | 541 | ||
| 542 | static void unplug_slaves(mddev_t *mddev) | 542 | static void unplug_slaves(mddev_t *mddev) |
| 543 | { | 543 | { |
| 544 | conf_t *conf = mddev_to_conf(mddev); | 544 | conf_t *conf = mddev->private; |
| 545 | int i; | 545 | int i; |
| 546 | 546 | ||
| 547 | rcu_read_lock(); | 547 | rcu_read_lock(); |
| @@ -573,7 +573,7 @@ static void raid1_unplug(struct request_queue *q) | |||
| 573 | static int raid1_congested(void *data, int bits) | 573 | static int raid1_congested(void *data, int bits) |
| 574 | { | 574 | { |
| 575 | mddev_t *mddev = data; | 575 | mddev_t *mddev = data; |
| 576 | conf_t *conf = mddev_to_conf(mddev); | 576 | conf_t *conf = mddev->private; |
| 577 | int i, ret = 0; | 577 | int i, ret = 0; |
| 578 | 578 | ||
| 579 | rcu_read_lock(); | 579 | rcu_read_lock(); |
| @@ -772,7 +772,7 @@ do_sync_io: | |||
| 772 | static int make_request(struct request_queue *q, struct bio * bio) | 772 | static int make_request(struct request_queue *q, struct bio * bio) |
| 773 | { | 773 | { |
| 774 | mddev_t *mddev = q->queuedata; | 774 | mddev_t *mddev = q->queuedata; |
| 775 | conf_t *conf = mddev_to_conf(mddev); | 775 | conf_t *conf = mddev->private; |
| 776 | mirror_info_t *mirror; | 776 | mirror_info_t *mirror; |
| 777 | r1bio_t *r1_bio; | 777 | r1bio_t *r1_bio; |
| 778 | struct bio *read_bio; | 778 | struct bio *read_bio; |
| @@ -991,7 +991,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
| 991 | 991 | ||
| 992 | static void status(struct seq_file *seq, mddev_t *mddev) | 992 | static void status(struct seq_file *seq, mddev_t *mddev) |
| 993 | { | 993 | { |
| 994 | conf_t *conf = mddev_to_conf(mddev); | 994 | conf_t *conf = mddev->private; |
| 995 | int i; | 995 | int i; |
| 996 | 996 | ||
| 997 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, | 997 | seq_printf(seq, " [%d/%d] [", conf->raid_disks, |
| @@ -1010,7 +1010,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
| 1010 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1010 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
| 1011 | { | 1011 | { |
| 1012 | char b[BDEVNAME_SIZE]; | 1012 | char b[BDEVNAME_SIZE]; |
| 1013 | conf_t *conf = mddev_to_conf(mddev); | 1013 | conf_t *conf = mddev->private; |
| 1014 | 1014 | ||
| 1015 | /* | 1015 | /* |
| 1016 | * If it is not operational, then we have already marked it as dead | 1016 | * If it is not operational, then we have already marked it as dead |
| @@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1130 | * a one page request is never in violation. | 1130 | * a one page request is never in violation. |
| 1131 | */ | 1131 | */ |
| 1132 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1132 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 1133 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 1133 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 1134 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 1134 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 1135 | 1135 | ||
| 1136 | p->head_position = 0; | 1136 | p->head_position = 0; |
| @@ -1214,7 +1214,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
| 1214 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 1214 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 1215 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 1215 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
| 1216 | mddev_t *mddev = r1_bio->mddev; | 1216 | mddev_t *mddev = r1_bio->mddev; |
| 1217 | conf_t *conf = mddev_to_conf(mddev); | 1217 | conf_t *conf = mddev->private; |
| 1218 | int i; | 1218 | int i; |
| 1219 | int mirror=0; | 1219 | int mirror=0; |
| 1220 | 1220 | ||
| @@ -1248,7 +1248,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
| 1248 | 1248 | ||
| 1249 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | 1249 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) |
| 1250 | { | 1250 | { |
| 1251 | conf_t *conf = mddev_to_conf(mddev); | 1251 | conf_t *conf = mddev->private; |
| 1252 | int i; | 1252 | int i; |
| 1253 | int disks = conf->raid_disks; | 1253 | int disks = conf->raid_disks; |
| 1254 | struct bio *bio, *wbio; | 1254 | struct bio *bio, *wbio; |
| @@ -1562,7 +1562,7 @@ static void raid1d(mddev_t *mddev) | |||
| 1562 | r1bio_t *r1_bio; | 1562 | r1bio_t *r1_bio; |
| 1563 | struct bio *bio; | 1563 | struct bio *bio; |
| 1564 | unsigned long flags; | 1564 | unsigned long flags; |
| 1565 | conf_t *conf = mddev_to_conf(mddev); | 1565 | conf_t *conf = mddev->private; |
| 1566 | struct list_head *head = &conf->retry_list; | 1566 | struct list_head *head = &conf->retry_list; |
| 1567 | int unplug=0; | 1567 | int unplug=0; |
| 1568 | mdk_rdev_t *rdev; | 1568 | mdk_rdev_t *rdev; |
| @@ -1585,7 +1585,7 @@ static void raid1d(mddev_t *mddev) | |||
| 1585 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1585 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 1586 | 1586 | ||
| 1587 | mddev = r1_bio->mddev; | 1587 | mddev = r1_bio->mddev; |
| 1588 | conf = mddev_to_conf(mddev); | 1588 | conf = mddev->private; |
| 1589 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { | 1589 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { |
| 1590 | sync_request_write(mddev, r1_bio); | 1590 | sync_request_write(mddev, r1_bio); |
| 1591 | unplug = 1; | 1591 | unplug = 1; |
| @@ -1706,7 +1706,7 @@ static int init_resync(conf_t *conf) | |||
| 1706 | 1706 | ||
| 1707 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 1707 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) |
| 1708 | { | 1708 | { |
| 1709 | conf_t *conf = mddev_to_conf(mddev); | 1709 | conf_t *conf = mddev->private; |
| 1710 | r1bio_t *r1_bio; | 1710 | r1bio_t *r1_bio; |
| 1711 | struct bio *bio; | 1711 | struct bio *bio; |
| 1712 | sector_t max_sector, nr_sectors; | 1712 | sector_t max_sector, nr_sectors; |
| @@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev) | |||
| 1996 | * a one page request is never in violation. | 1996 | * a one page request is never in violation. |
| 1997 | */ | 1997 | */ |
| 1998 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1998 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 1999 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 1999 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 2000 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 2000 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 2001 | 2001 | ||
| 2002 | disk->head_position = 0; | 2002 | disk->head_position = 0; |
| @@ -2052,6 +2052,10 @@ static int run(mddev_t *mddev) | |||
| 2052 | goto out_free_conf; | 2052 | goto out_free_conf; |
| 2053 | } | 2053 | } |
| 2054 | 2054 | ||
| 2055 | if (mddev->recovery_cp != MaxSector) | ||
| 2056 | printk(KERN_NOTICE "raid1: %s is not clean" | ||
| 2057 | " -- starting background reconstruction\n", | ||
| 2058 | mdname(mddev)); | ||
| 2055 | printk(KERN_INFO | 2059 | printk(KERN_INFO |
| 2056 | "raid1: raid set %s active with %d out of %d mirrors\n", | 2060 | "raid1: raid set %s active with %d out of %d mirrors\n", |
| 2057 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 2061 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
| @@ -2087,7 +2091,7 @@ out: | |||
| 2087 | 2091 | ||
| 2088 | static int stop(mddev_t *mddev) | 2092 | static int stop(mddev_t *mddev) |
| 2089 | { | 2093 | { |
| 2090 | conf_t *conf = mddev_to_conf(mddev); | 2094 | conf_t *conf = mddev->private; |
| 2091 | struct bitmap *bitmap = mddev->bitmap; | 2095 | struct bitmap *bitmap = mddev->bitmap; |
| 2092 | int behind_wait = 0; | 2096 | int behind_wait = 0; |
| 2093 | 2097 | ||
| @@ -2155,16 +2159,16 @@ static int raid1_reshape(mddev_t *mddev) | |||
| 2155 | mempool_t *newpool, *oldpool; | 2159 | mempool_t *newpool, *oldpool; |
| 2156 | struct pool_info *newpoolinfo; | 2160 | struct pool_info *newpoolinfo; |
| 2157 | mirror_info_t *newmirrors; | 2161 | mirror_info_t *newmirrors; |
| 2158 | conf_t *conf = mddev_to_conf(mddev); | 2162 | conf_t *conf = mddev->private; |
| 2159 | int cnt, raid_disks; | 2163 | int cnt, raid_disks; |
| 2160 | unsigned long flags; | 2164 | unsigned long flags; |
| 2161 | int d, d2, err; | 2165 | int d, d2, err; |
| 2162 | 2166 | ||
| 2163 | /* Cannot change chunk_size, layout, or level */ | 2167 | /* Cannot change chunk_size, layout, or level */ |
| 2164 | if (mddev->chunk_size != mddev->new_chunk || | 2168 | if (mddev->chunk_sectors != mddev->new_chunk_sectors || |
| 2165 | mddev->layout != mddev->new_layout || | 2169 | mddev->layout != mddev->new_layout || |
| 2166 | mddev->level != mddev->new_level) { | 2170 | mddev->level != mddev->new_level) { |
| 2167 | mddev->new_chunk = mddev->chunk_size; | 2171 | mddev->new_chunk_sectors = mddev->chunk_sectors; |
| 2168 | mddev->new_layout = mddev->layout; | 2172 | mddev->new_layout = mddev->layout; |
| 2169 | mddev->new_level = mddev->level; | 2173 | mddev->new_level = mddev->level; |
| 2170 | return -EINVAL; | 2174 | return -EINVAL; |
| @@ -2252,7 +2256,7 @@ static int raid1_reshape(mddev_t *mddev) | |||
| 2252 | 2256 | ||
| 2253 | static void raid1_quiesce(mddev_t *mddev, int state) | 2257 | static void raid1_quiesce(mddev_t *mddev, int state) |
| 2254 | { | 2258 | { |
| 2255 | conf_t *conf = mddev_to_conf(mddev); | 2259 | conf_t *conf = mddev->private; |
| 2256 | 2260 | ||
| 2257 | switch(state) { | 2261 | switch(state) { |
| 2258 | case 1: | 2262 | case 1: |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 1620eea3d57c..e87b84deff68 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h | |||
| @@ -64,12 +64,6 @@ struct r1_private_data_s { | |||
| 64 | typedef struct r1_private_data_s conf_t; | 64 | typedef struct r1_private_data_s conf_t; |
| 65 | 65 | ||
| 66 | /* | 66 | /* |
| 67 | * this is the only point in the RAID code where we violate | ||
| 68 | * C type safety. mddev->private is an 'opaque' pointer. | ||
| 69 | */ | ||
| 70 | #define mddev_to_conf(mddev) ((conf_t *) mddev->private) | ||
| 71 | |||
| 72 | /* | ||
| 73 | * this is our 'private' RAID1 bio. | 67 | * this is our 'private' RAID1 bio. |
| 74 | * | 68 | * |
| 75 | * it contains information about what kind of IO operations were started | 69 | * it contains information about what kind of IO operations were started |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 499620afb44b..ae12ceafe10c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
| @@ -188,7 +188,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) | |||
| 188 | 188 | ||
| 189 | static void free_r10bio(r10bio_t *r10_bio) | 189 | static void free_r10bio(r10bio_t *r10_bio) |
| 190 | { | 190 | { |
| 191 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 191 | conf_t *conf = r10_bio->mddev->private; |
| 192 | 192 | ||
| 193 | /* | 193 | /* |
| 194 | * Wake up any possible resync thread that waits for the device | 194 | * Wake up any possible resync thread that waits for the device |
| @@ -202,7 +202,7 @@ static void free_r10bio(r10bio_t *r10_bio) | |||
| 202 | 202 | ||
| 203 | static void put_buf(r10bio_t *r10_bio) | 203 | static void put_buf(r10bio_t *r10_bio) |
| 204 | { | 204 | { |
| 205 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 205 | conf_t *conf = r10_bio->mddev->private; |
| 206 | 206 | ||
| 207 | mempool_free(r10_bio, conf->r10buf_pool); | 207 | mempool_free(r10_bio, conf->r10buf_pool); |
| 208 | 208 | ||
| @@ -213,7 +213,7 @@ static void reschedule_retry(r10bio_t *r10_bio) | |||
| 213 | { | 213 | { |
| 214 | unsigned long flags; | 214 | unsigned long flags; |
| 215 | mddev_t *mddev = r10_bio->mddev; | 215 | mddev_t *mddev = r10_bio->mddev; |
| 216 | conf_t *conf = mddev_to_conf(mddev); | 216 | conf_t *conf = mddev->private; |
| 217 | 217 | ||
| 218 | spin_lock_irqsave(&conf->device_lock, flags); | 218 | spin_lock_irqsave(&conf->device_lock, flags); |
| 219 | list_add(&r10_bio->retry_list, &conf->retry_list); | 219 | list_add(&r10_bio->retry_list, &conf->retry_list); |
| @@ -245,7 +245,7 @@ static void raid_end_bio_io(r10bio_t *r10_bio) | |||
| 245 | */ | 245 | */ |
| 246 | static inline void update_head_pos(int slot, r10bio_t *r10_bio) | 246 | static inline void update_head_pos(int slot, r10bio_t *r10_bio) |
| 247 | { | 247 | { |
| 248 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 248 | conf_t *conf = r10_bio->mddev->private; |
| 249 | 249 | ||
| 250 | conf->mirrors[r10_bio->devs[slot].devnum].head_position = | 250 | conf->mirrors[r10_bio->devs[slot].devnum].head_position = |
| 251 | r10_bio->devs[slot].addr + (r10_bio->sectors); | 251 | r10_bio->devs[slot].addr + (r10_bio->sectors); |
| @@ -256,7 +256,7 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
| 256 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 256 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 257 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 257 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
| 258 | int slot, dev; | 258 | int slot, dev; |
| 259 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 259 | conf_t *conf = r10_bio->mddev->private; |
| 260 | 260 | ||
| 261 | 261 | ||
| 262 | slot = r10_bio->read_slot; | 262 | slot = r10_bio->read_slot; |
| @@ -297,7 +297,7 @@ static void raid10_end_write_request(struct bio *bio, int error) | |||
| 297 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 297 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 298 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 298 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
| 299 | int slot, dev; | 299 | int slot, dev; |
| 300 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 300 | conf_t *conf = r10_bio->mddev->private; |
| 301 | 301 | ||
| 302 | for (slot = 0; slot < conf->copies; slot++) | 302 | for (slot = 0; slot < conf->copies; slot++) |
| 303 | if (r10_bio->devs[slot].bio == bio) | 303 | if (r10_bio->devs[slot].bio == bio) |
| @@ -461,7 +461,7 @@ static int raid10_mergeable_bvec(struct request_queue *q, | |||
| 461 | mddev_t *mddev = q->queuedata; | 461 | mddev_t *mddev = q->queuedata; |
| 462 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 462 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
| 463 | int max; | 463 | int max; |
| 464 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 464 | unsigned int chunk_sectors = mddev->chunk_sectors; |
| 465 | unsigned int bio_sectors = bvm->bi_size >> 9; | 465 | unsigned int bio_sectors = bvm->bi_size >> 9; |
| 466 | 466 | ||
| 467 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 467 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; |
| @@ -596,7 +596,7 @@ rb_out: | |||
| 596 | 596 | ||
| 597 | static void unplug_slaves(mddev_t *mddev) | 597 | static void unplug_slaves(mddev_t *mddev) |
| 598 | { | 598 | { |
| 599 | conf_t *conf = mddev_to_conf(mddev); | 599 | conf_t *conf = mddev->private; |
| 600 | int i; | 600 | int i; |
| 601 | 601 | ||
| 602 | rcu_read_lock(); | 602 | rcu_read_lock(); |
| @@ -628,7 +628,7 @@ static void raid10_unplug(struct request_queue *q) | |||
| 628 | static int raid10_congested(void *data, int bits) | 628 | static int raid10_congested(void *data, int bits) |
| 629 | { | 629 | { |
| 630 | mddev_t *mddev = data; | 630 | mddev_t *mddev = data; |
| 631 | conf_t *conf = mddev_to_conf(mddev); | 631 | conf_t *conf = mddev->private; |
| 632 | int i, ret = 0; | 632 | int i, ret = 0; |
| 633 | 633 | ||
| 634 | rcu_read_lock(); | 634 | rcu_read_lock(); |
| @@ -788,7 +788,7 @@ static void unfreeze_array(conf_t *conf) | |||
| 788 | static int make_request(struct request_queue *q, struct bio * bio) | 788 | static int make_request(struct request_queue *q, struct bio * bio) |
| 789 | { | 789 | { |
| 790 | mddev_t *mddev = q->queuedata; | 790 | mddev_t *mddev = q->queuedata; |
| 791 | conf_t *conf = mddev_to_conf(mddev); | 791 | conf_t *conf = mddev->private; |
| 792 | mirror_info_t *mirror; | 792 | mirror_info_t *mirror; |
| 793 | r10bio_t *r10_bio; | 793 | r10bio_t *r10_bio; |
| 794 | struct bio *read_bio; | 794 | struct bio *read_bio; |
| @@ -981,11 +981,11 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
| 981 | 981 | ||
| 982 | static void status(struct seq_file *seq, mddev_t *mddev) | 982 | static void status(struct seq_file *seq, mddev_t *mddev) |
| 983 | { | 983 | { |
| 984 | conf_t *conf = mddev_to_conf(mddev); | 984 | conf_t *conf = mddev->private; |
| 985 | int i; | 985 | int i; |
| 986 | 986 | ||
| 987 | if (conf->near_copies < conf->raid_disks) | 987 | if (conf->near_copies < conf->raid_disks) |
| 988 | seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); | 988 | seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); |
| 989 | if (conf->near_copies > 1) | 989 | if (conf->near_copies > 1) |
| 990 | seq_printf(seq, " %d near-copies", conf->near_copies); | 990 | seq_printf(seq, " %d near-copies", conf->near_copies); |
| 991 | if (conf->far_copies > 1) { | 991 | if (conf->far_copies > 1) { |
| @@ -1006,7 +1006,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
| 1006 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) | 1006 | static void error(mddev_t *mddev, mdk_rdev_t *rdev) |
| 1007 | { | 1007 | { |
| 1008 | char b[BDEVNAME_SIZE]; | 1008 | char b[BDEVNAME_SIZE]; |
| 1009 | conf_t *conf = mddev_to_conf(mddev); | 1009 | conf_t *conf = mddev->private; |
| 1010 | 1010 | ||
| 1011 | /* | 1011 | /* |
| 1012 | * If it is not operational, then we have already marked it as dead | 1012 | * If it is not operational, then we have already marked it as dead |
| @@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1158 | * a one page request is never in violation. | 1158 | * a one page request is never in violation. |
| 1159 | */ | 1159 | */ |
| 1160 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1160 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 1161 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 1161 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 1162 | mddev->queue->max_sectors = (PAGE_SIZE>>9); | 1162 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 1163 | 1163 | ||
| 1164 | p->head_position = 0; | 1164 | p->head_position = 0; |
| 1165 | rdev->raid_disk = mirror; | 1165 | rdev->raid_disk = mirror; |
| @@ -1215,7 +1215,7 @@ abort: | |||
| 1215 | static void end_sync_read(struct bio *bio, int error) | 1215 | static void end_sync_read(struct bio *bio, int error) |
| 1216 | { | 1216 | { |
| 1217 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 1217 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
| 1218 | conf_t *conf = mddev_to_conf(r10_bio->mddev); | 1218 | conf_t *conf = r10_bio->mddev->private; |
| 1219 | int i,d; | 1219 | int i,d; |
| 1220 | 1220 | ||
| 1221 | for (i=0; i<conf->copies; i++) | 1221 | for (i=0; i<conf->copies; i++) |
| @@ -1253,7 +1253,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
| 1253 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 1253 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
| 1254 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); | 1254 | r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); |
| 1255 | mddev_t *mddev = r10_bio->mddev; | 1255 | mddev_t *mddev = r10_bio->mddev; |
| 1256 | conf_t *conf = mddev_to_conf(mddev); | 1256 | conf_t *conf = mddev->private; |
| 1257 | int i,d; | 1257 | int i,d; |
| 1258 | 1258 | ||
| 1259 | for (i = 0; i < conf->copies; i++) | 1259 | for (i = 0; i < conf->copies; i++) |
| @@ -1300,7 +1300,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
| 1300 | */ | 1300 | */ |
| 1301 | static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) | 1301 | static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) |
| 1302 | { | 1302 | { |
| 1303 | conf_t *conf = mddev_to_conf(mddev); | 1303 | conf_t *conf = mddev->private; |
| 1304 | int i, first; | 1304 | int i, first; |
| 1305 | struct bio *tbio, *fbio; | 1305 | struct bio *tbio, *fbio; |
| 1306 | 1306 | ||
| @@ -1400,7 +1400,7 @@ done: | |||
| 1400 | 1400 | ||
| 1401 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | 1401 | static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) |
| 1402 | { | 1402 | { |
| 1403 | conf_t *conf = mddev_to_conf(mddev); | 1403 | conf_t *conf = mddev->private; |
| 1404 | int i, d; | 1404 | int i, d; |
| 1405 | struct bio *bio, *wbio; | 1405 | struct bio *bio, *wbio; |
| 1406 | 1406 | ||
| @@ -1549,7 +1549,7 @@ static void raid10d(mddev_t *mddev) | |||
| 1549 | r10bio_t *r10_bio; | 1549 | r10bio_t *r10_bio; |
| 1550 | struct bio *bio; | 1550 | struct bio *bio; |
| 1551 | unsigned long flags; | 1551 | unsigned long flags; |
| 1552 | conf_t *conf = mddev_to_conf(mddev); | 1552 | conf_t *conf = mddev->private; |
| 1553 | struct list_head *head = &conf->retry_list; | 1553 | struct list_head *head = &conf->retry_list; |
| 1554 | int unplug=0; | 1554 | int unplug=0; |
| 1555 | mdk_rdev_t *rdev; | 1555 | mdk_rdev_t *rdev; |
| @@ -1572,7 +1572,7 @@ static void raid10d(mddev_t *mddev) | |||
| 1572 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1572 | spin_unlock_irqrestore(&conf->device_lock, flags); |
| 1573 | 1573 | ||
| 1574 | mddev = r10_bio->mddev; | 1574 | mddev = r10_bio->mddev; |
| 1575 | conf = mddev_to_conf(mddev); | 1575 | conf = mddev->private; |
| 1576 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) { | 1576 | if (test_bit(R10BIO_IsSync, &r10_bio->state)) { |
| 1577 | sync_request_write(mddev, r10_bio); | 1577 | sync_request_write(mddev, r10_bio); |
| 1578 | unplug = 1; | 1578 | unplug = 1; |
| @@ -1680,7 +1680,7 @@ static int init_resync(conf_t *conf) | |||
| 1680 | 1680 | ||
| 1681 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 1681 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) |
| 1682 | { | 1682 | { |
| 1683 | conf_t *conf = mddev_to_conf(mddev); | 1683 | conf_t *conf = mddev->private; |
| 1684 | r10bio_t *r10_bio; | 1684 | r10bio_t *r10_bio; |
| 1685 | struct bio *biolist = NULL, *bio; | 1685 | struct bio *biolist = NULL, *bio; |
| 1686 | sector_t max_sector, nr_sectors; | 1686 | sector_t max_sector, nr_sectors; |
| @@ -2026,7 +2026,7 @@ static sector_t | |||
| 2026 | raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) | 2026 | raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks) |
| 2027 | { | 2027 | { |
| 2028 | sector_t size; | 2028 | sector_t size; |
| 2029 | conf_t *conf = mddev_to_conf(mddev); | 2029 | conf_t *conf = mddev->private; |
| 2030 | 2030 | ||
| 2031 | if (!raid_disks) | 2031 | if (!raid_disks) |
| 2032 | raid_disks = mddev->raid_disks; | 2032 | raid_disks = mddev->raid_disks; |
| @@ -2050,9 +2050,10 @@ static int run(mddev_t *mddev) | |||
| 2050 | int nc, fc, fo; | 2050 | int nc, fc, fo; |
| 2051 | sector_t stride, size; | 2051 | sector_t stride, size; |
| 2052 | 2052 | ||
| 2053 | if (mddev->chunk_size < PAGE_SIZE) { | 2053 | if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || |
| 2054 | !is_power_of_2(mddev->chunk_sectors)) { | ||
| 2054 | printk(KERN_ERR "md/raid10: chunk size must be " | 2055 | printk(KERN_ERR "md/raid10: chunk size must be " |
| 2055 | "at least PAGE_SIZE(%ld).\n", PAGE_SIZE); | 2056 | "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); |
| 2056 | return -EINVAL; | 2057 | return -EINVAL; |
| 2057 | } | 2058 | } |
| 2058 | 2059 | ||
| @@ -2095,8 +2096,8 @@ static int run(mddev_t *mddev) | |||
| 2095 | conf->far_copies = fc; | 2096 | conf->far_copies = fc; |
| 2096 | conf->copies = nc*fc; | 2097 | conf->copies = nc*fc; |
| 2097 | conf->far_offset = fo; | 2098 | conf->far_offset = fo; |
| 2098 | conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; | 2099 | conf->chunk_mask = mddev->chunk_sectors - 1; |
| 2099 | conf->chunk_shift = ffz(~mddev->chunk_size) - 9; | 2100 | conf->chunk_shift = ffz(~mddev->chunk_sectors); |
| 2100 | size = mddev->dev_sectors >> conf->chunk_shift; | 2101 | size = mddev->dev_sectors >> conf->chunk_shift; |
| 2101 | sector_div(size, fc); | 2102 | sector_div(size, fc); |
| 2102 | size = size * conf->raid_disks; | 2103 | size = size * conf->raid_disks; |
| @@ -2145,8 +2146,8 @@ static int run(mddev_t *mddev) | |||
| 2145 | * a one page request is never in violation. | 2146 | * a one page request is never in violation. |
| 2146 | */ | 2147 | */ |
| 2147 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 2148 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && |
| 2148 | mddev->queue->max_sectors > (PAGE_SIZE>>9)) | 2149 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) |
| 2149 | mddev->queue->max_sectors = (PAGE_SIZE>>9); | 2150 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); |
| 2150 | 2151 | ||
| 2151 | disk->head_position = 0; | 2152 | disk->head_position = 0; |
| 2152 | } | 2153 | } |
| @@ -2185,6 +2186,10 @@ static int run(mddev_t *mddev) | |||
| 2185 | goto out_free_conf; | 2186 | goto out_free_conf; |
| 2186 | } | 2187 | } |
| 2187 | 2188 | ||
| 2189 | if (mddev->recovery_cp != MaxSector) | ||
| 2190 | printk(KERN_NOTICE "raid10: %s is not clean" | ||
| 2191 | " -- starting background reconstruction\n", | ||
| 2192 | mdname(mddev)); | ||
| 2188 | printk(KERN_INFO | 2193 | printk(KERN_INFO |
| 2189 | "raid10: raid set %s active with %d out of %d devices\n", | 2194 | "raid10: raid set %s active with %d out of %d devices\n", |
| 2190 | mdname(mddev), mddev->raid_disks - mddev->degraded, | 2195 | mdname(mddev), mddev->raid_disks - mddev->degraded, |
| @@ -2204,7 +2209,8 @@ static int run(mddev_t *mddev) | |||
| 2204 | * maybe... | 2209 | * maybe... |
| 2205 | */ | 2210 | */ |
| 2206 | { | 2211 | { |
| 2207 | int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE); | 2212 | int stripe = conf->raid_disks * |
| 2213 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); | ||
| 2208 | stripe /= conf->near_copies; | 2214 | stripe /= conf->near_copies; |
| 2209 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) | 2215 | if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) |
| 2210 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 2216 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; |
| @@ -2227,7 +2233,7 @@ out: | |||
| 2227 | 2233 | ||
| 2228 | static int stop(mddev_t *mddev) | 2234 | static int stop(mddev_t *mddev) |
| 2229 | { | 2235 | { |
| 2230 | conf_t *conf = mddev_to_conf(mddev); | 2236 | conf_t *conf = mddev->private; |
| 2231 | 2237 | ||
| 2232 | raise_barrier(conf, 0); | 2238 | raise_barrier(conf, 0); |
| 2233 | lower_barrier(conf); | 2239 | lower_barrier(conf); |
| @@ -2245,7 +2251,7 @@ static int stop(mddev_t *mddev) | |||
| 2245 | 2251 | ||
| 2246 | static void raid10_quiesce(mddev_t *mddev, int state) | 2252 | static void raid10_quiesce(mddev_t *mddev, int state) |
| 2247 | { | 2253 | { |
| 2248 | conf_t *conf = mddev_to_conf(mddev); | 2254 | conf_t *conf = mddev->private; |
| 2249 | 2255 | ||
| 2250 | switch(state) { | 2256 | switch(state) { |
| 2251 | case 1: | 2257 | case 1: |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 244dbe507a54..59cd1efb8d30 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
| @@ -62,12 +62,6 @@ struct r10_private_data_s { | |||
| 62 | typedef struct r10_private_data_s conf_t; | 62 | typedef struct r10_private_data_s conf_t; |
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * this is the only point in the RAID code where we violate | ||
| 66 | * C type safety. mddev->private is an 'opaque' pointer. | ||
| 67 | */ | ||
| 68 | #define mddev_to_conf(mddev) ((conf_t *) mddev->private) | ||
| 69 | |||
| 70 | /* | ||
| 71 | * this is our 'private' RAID10 bio. | 65 | * this is our 'private' RAID10 bio. |
| 72 | * | 66 | * |
| 73 | * it contains information about what kind of IO operations were started | 67 | * it contains information about what kind of IO operations were started |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54ef8d75541d..cac6f4d3a143 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
| @@ -1617,8 +1617,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, | |||
| 1617 | sector_t new_sector; | 1617 | sector_t new_sector; |
| 1618 | int algorithm = previous ? conf->prev_algo | 1618 | int algorithm = previous ? conf->prev_algo |
| 1619 | : conf->algorithm; | 1619 | : conf->algorithm; |
| 1620 | int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) | 1620 | int sectors_per_chunk = previous ? conf->prev_chunk_sectors |
| 1621 | : (conf->chunk_size >> 9); | 1621 | : conf->chunk_sectors; |
| 1622 | int raid_disks = previous ? conf->previous_raid_disks | 1622 | int raid_disks = previous ? conf->previous_raid_disks |
| 1623 | : conf->raid_disks; | 1623 | : conf->raid_disks; |
| 1624 | int data_disks = raid_disks - conf->max_degraded; | 1624 | int data_disks = raid_disks - conf->max_degraded; |
| @@ -1823,8 +1823,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) | |||
| 1823 | int raid_disks = sh->disks; | 1823 | int raid_disks = sh->disks; |
| 1824 | int data_disks = raid_disks - conf->max_degraded; | 1824 | int data_disks = raid_disks - conf->max_degraded; |
| 1825 | sector_t new_sector = sh->sector, check; | 1825 | sector_t new_sector = sh->sector, check; |
| 1826 | int sectors_per_chunk = previous ? (conf->prev_chunk >> 9) | 1826 | int sectors_per_chunk = previous ? conf->prev_chunk_sectors |
| 1827 | : (conf->chunk_size >> 9); | 1827 | : conf->chunk_sectors; |
| 1828 | int algorithm = previous ? conf->prev_algo | 1828 | int algorithm = previous ? conf->prev_algo |
| 1829 | : conf->algorithm; | 1829 | : conf->algorithm; |
| 1830 | sector_t stripe; | 1830 | sector_t stripe; |
| @@ -2098,8 +2098,7 @@ static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, | |||
| 2098 | struct stripe_head *sh) | 2098 | struct stripe_head *sh) |
| 2099 | { | 2099 | { |
| 2100 | int sectors_per_chunk = | 2100 | int sectors_per_chunk = |
| 2101 | previous ? (conf->prev_chunk >> 9) | 2101 | previous ? conf->prev_chunk_sectors : conf->chunk_sectors; |
| 2102 | : (conf->chunk_size >> 9); | ||
| 2103 | int dd_idx; | 2102 | int dd_idx; |
| 2104 | int chunk_offset = sector_div(stripe, sectors_per_chunk); | 2103 | int chunk_offset = sector_div(stripe, sectors_per_chunk); |
| 2105 | int disks = previous ? conf->previous_raid_disks : conf->raid_disks; | 2104 | int disks = previous ? conf->previous_raid_disks : conf->raid_disks; |
| @@ -3496,7 +3495,7 @@ static void activate_bit_delay(raid5_conf_t *conf) | |||
| 3496 | 3495 | ||
| 3497 | static void unplug_slaves(mddev_t *mddev) | 3496 | static void unplug_slaves(mddev_t *mddev) |
| 3498 | { | 3497 | { |
| 3499 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3498 | raid5_conf_t *conf = mddev->private; |
| 3500 | int i; | 3499 | int i; |
| 3501 | 3500 | ||
| 3502 | rcu_read_lock(); | 3501 | rcu_read_lock(); |
| @@ -3520,7 +3519,7 @@ static void unplug_slaves(mddev_t *mddev) | |||
| 3520 | static void raid5_unplug_device(struct request_queue *q) | 3519 | static void raid5_unplug_device(struct request_queue *q) |
| 3521 | { | 3520 | { |
| 3522 | mddev_t *mddev = q->queuedata; | 3521 | mddev_t *mddev = q->queuedata; |
| 3523 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3522 | raid5_conf_t *conf = mddev->private; |
| 3524 | unsigned long flags; | 3523 | unsigned long flags; |
| 3525 | 3524 | ||
| 3526 | spin_lock_irqsave(&conf->device_lock, flags); | 3525 | spin_lock_irqsave(&conf->device_lock, flags); |
| @@ -3539,7 +3538,7 @@ static void raid5_unplug_device(struct request_queue *q) | |||
| 3539 | static int raid5_congested(void *data, int bits) | 3538 | static int raid5_congested(void *data, int bits) |
| 3540 | { | 3539 | { |
| 3541 | mddev_t *mddev = data; | 3540 | mddev_t *mddev = data; |
| 3542 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3541 | raid5_conf_t *conf = mddev->private; |
| 3543 | 3542 | ||
| 3544 | /* No difference between reads and writes. Just check | 3543 | /* No difference between reads and writes. Just check |
| 3545 | * how busy the stripe_cache is | 3544 | * how busy the stripe_cache is |
| @@ -3564,14 +3563,14 @@ static int raid5_mergeable_bvec(struct request_queue *q, | |||
| 3564 | mddev_t *mddev = q->queuedata; | 3563 | mddev_t *mddev = q->queuedata; |
| 3565 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 3564 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
| 3566 | int max; | 3565 | int max; |
| 3567 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 3566 | unsigned int chunk_sectors = mddev->chunk_sectors; |
| 3568 | unsigned int bio_sectors = bvm->bi_size >> 9; | 3567 | unsigned int bio_sectors = bvm->bi_size >> 9; |
| 3569 | 3568 | ||
| 3570 | if ((bvm->bi_rw & 1) == WRITE) | 3569 | if ((bvm->bi_rw & 1) == WRITE) |
| 3571 | return biovec->bv_len; /* always allow writes to be mergeable */ | 3570 | return biovec->bv_len; /* always allow writes to be mergeable */ |
| 3572 | 3571 | ||
| 3573 | if (mddev->new_chunk < mddev->chunk_size) | 3572 | if (mddev->new_chunk_sectors < mddev->chunk_sectors) |
| 3574 | chunk_sectors = mddev->new_chunk >> 9; | 3573 | chunk_sectors = mddev->new_chunk_sectors; |
| 3575 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 3574 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; |
| 3576 | if (max < 0) max = 0; | 3575 | if (max < 0) max = 0; |
| 3577 | if (max <= biovec->bv_len && bio_sectors == 0) | 3576 | if (max <= biovec->bv_len && bio_sectors == 0) |
| @@ -3584,11 +3583,11 @@ static int raid5_mergeable_bvec(struct request_queue *q, | |||
| 3584 | static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) | 3583 | static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) |
| 3585 | { | 3584 | { |
| 3586 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); | 3585 | sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); |
| 3587 | unsigned int chunk_sectors = mddev->chunk_size >> 9; | 3586 | unsigned int chunk_sectors = mddev->chunk_sectors; |
| 3588 | unsigned int bio_sectors = bio->bi_size >> 9; | 3587 | unsigned int bio_sectors = bio->bi_size >> 9; |
| 3589 | 3588 | ||
| 3590 | if (mddev->new_chunk < mddev->chunk_size) | 3589 | if (mddev->new_chunk_sectors < mddev->chunk_sectors) |
| 3591 | chunk_sectors = mddev->new_chunk >> 9; | 3590 | chunk_sectors = mddev->new_chunk_sectors; |
| 3592 | return chunk_sectors >= | 3591 | return chunk_sectors >= |
| 3593 | ((sector & (chunk_sectors - 1)) + bio_sectors); | 3592 | ((sector & (chunk_sectors - 1)) + bio_sectors); |
| 3594 | } | 3593 | } |
| @@ -3652,7 +3651,7 @@ static void raid5_align_endio(struct bio *bi, int error) | |||
| 3652 | bio_put(bi); | 3651 | bio_put(bi); |
| 3653 | 3652 | ||
| 3654 | mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; | 3653 | mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; |
| 3655 | conf = mddev_to_conf(mddev); | 3654 | conf = mddev->private; |
| 3656 | rdev = (void*)raid_bi->bi_next; | 3655 | rdev = (void*)raid_bi->bi_next; |
| 3657 | raid_bi->bi_next = NULL; | 3656 | raid_bi->bi_next = NULL; |
| 3658 | 3657 | ||
| @@ -3675,10 +3674,10 @@ static int bio_fits_rdev(struct bio *bi) | |||
| 3675 | { | 3674 | { |
| 3676 | struct request_queue *q = bdev_get_queue(bi->bi_bdev); | 3675 | struct request_queue *q = bdev_get_queue(bi->bi_bdev); |
| 3677 | 3676 | ||
| 3678 | if ((bi->bi_size>>9) > q->max_sectors) | 3677 | if ((bi->bi_size>>9) > queue_max_sectors(q)) |
| 3679 | return 0; | 3678 | return 0; |
| 3680 | blk_recount_segments(q, bi); | 3679 | blk_recount_segments(q, bi); |
| 3681 | if (bi->bi_phys_segments > q->max_phys_segments) | 3680 | if (bi->bi_phys_segments > queue_max_phys_segments(q)) |
| 3682 | return 0; | 3681 | return 0; |
| 3683 | 3682 | ||
| 3684 | if (q->merge_bvec_fn) | 3683 | if (q->merge_bvec_fn) |
| @@ -3694,7 +3693,7 @@ static int bio_fits_rdev(struct bio *bi) | |||
| 3694 | static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) | 3693 | static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) |
| 3695 | { | 3694 | { |
| 3696 | mddev_t *mddev = q->queuedata; | 3695 | mddev_t *mddev = q->queuedata; |
| 3697 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3696 | raid5_conf_t *conf = mddev->private; |
| 3698 | unsigned int dd_idx; | 3697 | unsigned int dd_idx; |
| 3699 | struct bio* align_bi; | 3698 | struct bio* align_bi; |
| 3700 | mdk_rdev_t *rdev; | 3699 | mdk_rdev_t *rdev; |
| @@ -3811,7 +3810,7 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) | |||
| 3811 | static int make_request(struct request_queue *q, struct bio * bi) | 3810 | static int make_request(struct request_queue *q, struct bio * bi) |
| 3812 | { | 3811 | { |
| 3813 | mddev_t *mddev = q->queuedata; | 3812 | mddev_t *mddev = q->queuedata; |
| 3814 | raid5_conf_t *conf = mddev_to_conf(mddev); | 3813 | raid5_conf_t *conf = mddev->private; |
| 3815 | int dd_idx; | 3814 | int dd_idx; |
| 3816 | sector_t new_sector; | 3815 | sector_t new_sector; |
| 3817 | sector_t logical_sector, last_sector; | 3816 | sector_t logical_sector, last_sector; |
| @@ -3908,6 +3907,7 @@ static int make_request(struct request_queue *q, struct bio * bi) | |||
| 3908 | spin_unlock_irq(&conf->device_lock); | 3907 | spin_unlock_irq(&conf->device_lock); |
| 3909 | if (must_retry) { | 3908 | if (must_retry) { |
| 3910 | release_stripe(sh); | 3909 | release_stripe(sh); |
| 3910 | schedule(); | ||
| 3911 | goto retry; | 3911 | goto retry; |
| 3912 | } | 3912 | } |
| 3913 | } | 3913 | } |
| @@ -4003,10 +4003,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
| 4003 | * If old and new chunk sizes differ, we need to process the | 4003 | * If old and new chunk sizes differ, we need to process the |
| 4004 | * largest of these | 4004 | * largest of these |
| 4005 | */ | 4005 | */ |
| 4006 | if (mddev->new_chunk > mddev->chunk_size) | 4006 | if (mddev->new_chunk_sectors > mddev->chunk_sectors) |
| 4007 | reshape_sectors = mddev->new_chunk / 512; | 4007 | reshape_sectors = mddev->new_chunk_sectors; |
| 4008 | else | 4008 | else |
| 4009 | reshape_sectors = mddev->chunk_size / 512; | 4009 | reshape_sectors = mddev->chunk_sectors; |
| 4010 | 4010 | ||
| 4011 | /* we update the metadata when there is more than 3Meg | 4011 | /* we update the metadata when there is more than 3Meg |
| 4012 | * in the block range (that is rather arbitrary, should | 4012 | * in the block range (that is rather arbitrary, should |
| @@ -4129,7 +4129,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
| 4129 | 1, &dd_idx, NULL); | 4129 | 1, &dd_idx, NULL); |
| 4130 | last_sector = | 4130 | last_sector = |
| 4131 | raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) | 4131 | raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) |
| 4132 | *(new_data_disks) - 1), | 4132 | * new_data_disks - 1), |
| 4133 | 1, &dd_idx, NULL); | 4133 | 1, &dd_idx, NULL); |
| 4134 | if (last_sector >= mddev->dev_sectors) | 4134 | if (last_sector >= mddev->dev_sectors) |
| 4135 | last_sector = mddev->dev_sectors - 1; | 4135 | last_sector = mddev->dev_sectors - 1; |
| @@ -4158,7 +4158,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped | |||
| 4158 | wait_event(conf->wait_for_overlap, | 4158 | wait_event(conf->wait_for_overlap, |
| 4159 | atomic_read(&conf->reshape_stripes) == 0); | 4159 | atomic_read(&conf->reshape_stripes) == 0); |
| 4160 | mddev->reshape_position = conf->reshape_progress; | 4160 | mddev->reshape_position = conf->reshape_progress; |
| 4161 | mddev->curr_resync_completed = mddev->curr_resync; | 4161 | mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; |
| 4162 | conf->reshape_checkpoint = jiffies; | 4162 | conf->reshape_checkpoint = jiffies; |
| 4163 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 4163 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 4164 | md_wakeup_thread(mddev->thread); | 4164 | md_wakeup_thread(mddev->thread); |
| @@ -4371,7 +4371,7 @@ static void synchronize_stripe_processing(struct list_head *domain) | |||
| 4371 | static void raid5d(mddev_t *mddev) | 4371 | static void raid5d(mddev_t *mddev) |
| 4372 | { | 4372 | { |
| 4373 | struct stripe_head *sh; | 4373 | struct stripe_head *sh; |
| 4374 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4374 | raid5_conf_t *conf = mddev->private; |
| 4375 | int handled; | 4375 | int handled; |
| 4376 | LIST_HEAD(raid_domain); | 4376 | LIST_HEAD(raid_domain); |
| 4377 | 4377 | ||
| @@ -4428,7 +4428,7 @@ static void raid5d(mddev_t *mddev) | |||
| 4428 | static ssize_t | 4428 | static ssize_t |
| 4429 | raid5_show_stripe_cache_size(mddev_t *mddev, char *page) | 4429 | raid5_show_stripe_cache_size(mddev_t *mddev, char *page) |
| 4430 | { | 4430 | { |
| 4431 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4431 | raid5_conf_t *conf = mddev->private; |
| 4432 | if (conf) | 4432 | if (conf) |
| 4433 | return sprintf(page, "%d\n", conf->max_nr_stripes); | 4433 | return sprintf(page, "%d\n", conf->max_nr_stripes); |
| 4434 | else | 4434 | else |
| @@ -4438,7 +4438,7 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page) | |||
| 4438 | static ssize_t | 4438 | static ssize_t |
| 4439 | raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) | 4439 | raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) |
| 4440 | { | 4440 | { |
| 4441 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4441 | raid5_conf_t *conf = mddev->private; |
| 4442 | unsigned long new; | 4442 | unsigned long new; |
| 4443 | int err; | 4443 | int err; |
| 4444 | 4444 | ||
| @@ -4476,7 +4476,7 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, | |||
| 4476 | static ssize_t | 4476 | static ssize_t |
| 4477 | raid5_show_preread_threshold(mddev_t *mddev, char *page) | 4477 | raid5_show_preread_threshold(mddev_t *mddev, char *page) |
| 4478 | { | 4478 | { |
| 4479 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4479 | raid5_conf_t *conf = mddev->private; |
| 4480 | if (conf) | 4480 | if (conf) |
| 4481 | return sprintf(page, "%d\n", conf->bypass_threshold); | 4481 | return sprintf(page, "%d\n", conf->bypass_threshold); |
| 4482 | else | 4482 | else |
| @@ -4486,7 +4486,7 @@ raid5_show_preread_threshold(mddev_t *mddev, char *page) | |||
| 4486 | static ssize_t | 4486 | static ssize_t |
| 4487 | raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) | 4487 | raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) |
| 4488 | { | 4488 | { |
| 4489 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4489 | raid5_conf_t *conf = mddev->private; |
| 4490 | unsigned long new; | 4490 | unsigned long new; |
| 4491 | if (len >= PAGE_SIZE) | 4491 | if (len >= PAGE_SIZE) |
| 4492 | return -EINVAL; | 4492 | return -EINVAL; |
| @@ -4510,7 +4510,7 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, | |||
| 4510 | static ssize_t | 4510 | static ssize_t |
| 4511 | stripe_cache_active_show(mddev_t *mddev, char *page) | 4511 | stripe_cache_active_show(mddev_t *mddev, char *page) |
| 4512 | { | 4512 | { |
| 4513 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4513 | raid5_conf_t *conf = mddev->private; |
| 4514 | if (conf) | 4514 | if (conf) |
| 4515 | return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); | 4515 | return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); |
| 4516 | else | 4516 | else |
| @@ -4534,7 +4534,7 @@ static struct attribute_group raid5_attrs_group = { | |||
| 4534 | static sector_t | 4534 | static sector_t |
| 4535 | raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | 4535 | raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) |
| 4536 | { | 4536 | { |
| 4537 | raid5_conf_t *conf = mddev_to_conf(mddev); | 4537 | raid5_conf_t *conf = mddev->private; |
| 4538 | 4538 | ||
| 4539 | if (!sectors) | 4539 | if (!sectors) |
| 4540 | sectors = mddev->dev_sectors; | 4540 | sectors = mddev->dev_sectors; |
| @@ -4546,8 +4546,8 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) | |||
| 4546 | raid_disks = conf->previous_raid_disks; | 4546 | raid_disks = conf->previous_raid_disks; |
| 4547 | } | 4547 | } |
| 4548 | 4548 | ||
| 4549 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | 4549 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); |
| 4550 | sectors &= ~((sector_t)mddev->new_chunk/512 - 1); | 4550 | sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); |
| 4551 | return sectors * (raid_disks - conf->max_degraded); | 4551 | return sectors * (raid_disks - conf->max_degraded); |
| 4552 | } | 4552 | } |
| 4553 | 4553 | ||
| @@ -4691,9 +4691,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
| 4691 | return ERR_PTR(-EINVAL); | 4691 | return ERR_PTR(-EINVAL); |
| 4692 | } | 4692 | } |
| 4693 | 4693 | ||
| 4694 | if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) { | 4694 | if (!mddev->new_chunk_sectors || |
| 4695 | (mddev->new_chunk_sectors << 9) % PAGE_SIZE || | ||
| 4696 | !is_power_of_2(mddev->new_chunk_sectors)) { | ||
| 4695 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", | 4697 | printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", |
| 4696 | mddev->new_chunk, mdname(mddev)); | 4698 | mddev->new_chunk_sectors << 9, mdname(mddev)); |
| 4697 | return ERR_PTR(-EINVAL); | 4699 | return ERR_PTR(-EINVAL); |
| 4698 | } | 4700 | } |
| 4699 | 4701 | ||
| @@ -4756,7 +4758,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
| 4756 | conf->fullsync = 1; | 4758 | conf->fullsync = 1; |
| 4757 | } | 4759 | } |
| 4758 | 4760 | ||
| 4759 | conf->chunk_size = mddev->new_chunk; | 4761 | conf->chunk_sectors = mddev->new_chunk_sectors; |
| 4762 | conf->level = mddev->new_level; | ||
| 4760 | if (conf->level == 6) | 4763 | if (conf->level == 6) |
| 4761 | conf->max_degraded = 2; | 4764 | conf->max_degraded = 2; |
| 4762 | else | 4765 | else |
| @@ -4765,7 +4768,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) | |||
| 4765 | conf->max_nr_stripes = NR_STRIPES; | 4768 | conf->max_nr_stripes = NR_STRIPES; |
| 4766 | conf->reshape_progress = mddev->reshape_position; | 4769 | conf->reshape_progress = mddev->reshape_position; |
| 4767 | if (conf->reshape_progress != MaxSector) { | 4770 | if (conf->reshape_progress != MaxSector) { |
| 4768 | conf->prev_chunk = mddev->chunk_size; | 4771 | conf->prev_chunk_sectors = mddev->chunk_sectors; |
| 4769 | conf->prev_algo = mddev->layout; | 4772 | conf->prev_algo = mddev->layout; |
| 4770 | } | 4773 | } |
| 4771 | 4774 | ||
| @@ -4803,6 +4806,10 @@ static int run(mddev_t *mddev) | |||
| 4803 | int working_disks = 0; | 4806 | int working_disks = 0; |
| 4804 | mdk_rdev_t *rdev; | 4807 | mdk_rdev_t *rdev; |
| 4805 | 4808 | ||
| 4809 | if (mddev->recovery_cp != MaxSector) | ||
| 4810 | printk(KERN_NOTICE "raid5: %s is not clean" | ||
| 4811 | " -- starting background reconstruction\n", | ||
| 4812 | mdname(mddev)); | ||
| 4806 | if (mddev->reshape_position != MaxSector) { | 4813 | if (mddev->reshape_position != MaxSector) { |
| 4807 | /* Check that we can continue the reshape. | 4814 | /* Check that we can continue the reshape. |
| 4808 | * Currently only disks can change, it must | 4815 | * Currently only disks can change, it must |
| @@ -4825,7 +4832,7 @@ static int run(mddev_t *mddev) | |||
| 4825 | * geometry. | 4832 | * geometry. |
| 4826 | */ | 4833 | */ |
| 4827 | here_new = mddev->reshape_position; | 4834 | here_new = mddev->reshape_position; |
| 4828 | if (sector_div(here_new, (mddev->new_chunk>>9)* | 4835 | if (sector_div(here_new, mddev->new_chunk_sectors * |
| 4829 | (mddev->raid_disks - max_degraded))) { | 4836 | (mddev->raid_disks - max_degraded))) { |
| 4830 | printk(KERN_ERR "raid5: reshape_position not " | 4837 | printk(KERN_ERR "raid5: reshape_position not " |
| 4831 | "on a stripe boundary\n"); | 4838 | "on a stripe boundary\n"); |
| @@ -4833,7 +4840,7 @@ static int run(mddev_t *mddev) | |||
| 4833 | } | 4840 | } |
| 4834 | /* here_new is the stripe we will write to */ | 4841 | /* here_new is the stripe we will write to */ |
| 4835 | here_old = mddev->reshape_position; | 4842 | here_old = mddev->reshape_position; |
| 4836 | sector_div(here_old, (mddev->chunk_size>>9)* | 4843 | sector_div(here_old, mddev->chunk_sectors * |
| 4837 | (old_disks-max_degraded)); | 4844 | (old_disks-max_degraded)); |
| 4838 | /* here_old is the first stripe that we might need to read | 4845 | /* here_old is the first stripe that we might need to read |
| 4839 | * from */ | 4846 | * from */ |
| @@ -4848,7 +4855,7 @@ static int run(mddev_t *mddev) | |||
| 4848 | } else { | 4855 | } else { |
| 4849 | BUG_ON(mddev->level != mddev->new_level); | 4856 | BUG_ON(mddev->level != mddev->new_level); |
| 4850 | BUG_ON(mddev->layout != mddev->new_layout); | 4857 | BUG_ON(mddev->layout != mddev->new_layout); |
| 4851 | BUG_ON(mddev->chunk_size != mddev->new_chunk); | 4858 | BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); |
| 4852 | BUG_ON(mddev->delta_disks != 0); | 4859 | BUG_ON(mddev->delta_disks != 0); |
| 4853 | } | 4860 | } |
| 4854 | 4861 | ||
| @@ -4882,7 +4889,7 @@ static int run(mddev_t *mddev) | |||
| 4882 | } | 4889 | } |
| 4883 | 4890 | ||
| 4884 | /* device size must be a multiple of chunk size */ | 4891 | /* device size must be a multiple of chunk size */ |
| 4885 | mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1); | 4892 | mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); |
| 4886 | mddev->resync_max_sectors = mddev->dev_sectors; | 4893 | mddev->resync_max_sectors = mddev->dev_sectors; |
| 4887 | 4894 | ||
| 4888 | if (mddev->degraded > 0 && | 4895 | if (mddev->degraded > 0 && |
| @@ -4931,7 +4938,7 @@ static int run(mddev_t *mddev) | |||
| 4931 | { | 4938 | { |
| 4932 | int data_disks = conf->previous_raid_disks - conf->max_degraded; | 4939 | int data_disks = conf->previous_raid_disks - conf->max_degraded; |
| 4933 | int stripe = data_disks * | 4940 | int stripe = data_disks * |
| 4934 | (mddev->chunk_size / PAGE_SIZE); | 4941 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); |
| 4935 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | 4942 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
| 4936 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | 4943 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
| 4937 | } | 4944 | } |
| @@ -5021,7 +5028,8 @@ static void status(struct seq_file *seq, mddev_t *mddev) | |||
| 5021 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; | 5028 | raid5_conf_t *conf = (raid5_conf_t *) mddev->private; |
| 5022 | int i; | 5029 | int i; |
| 5023 | 5030 | ||
| 5024 | seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); | 5031 | seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, |
| 5032 | mddev->chunk_sectors / 2, mddev->layout); | ||
| 5025 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); | 5033 | seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); |
| 5026 | for (i = 0; i < conf->raid_disks; i++) | 5034 | for (i = 0; i < conf->raid_disks; i++) |
| 5027 | seq_printf (seq, "%s", | 5035 | seq_printf (seq, "%s", |
| @@ -5169,7 +5177,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
| 5169 | * any io in the removed space completes, but it hardly seems | 5177 | * any io in the removed space completes, but it hardly seems |
| 5170 | * worth it. | 5178 | * worth it. |
| 5171 | */ | 5179 | */ |
| 5172 | sectors &= ~((sector_t)mddev->chunk_size/512 - 1); | 5180 | sectors &= ~((sector_t)mddev->chunk_sectors - 1); |
| 5173 | md_set_array_sectors(mddev, raid5_size(mddev, sectors, | 5181 | md_set_array_sectors(mddev, raid5_size(mddev, sectors, |
| 5174 | mddev->raid_disks)); | 5182 | mddev->raid_disks)); |
| 5175 | if (mddev->array_sectors > | 5183 | if (mddev->array_sectors > |
| @@ -5186,14 +5194,37 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
| 5186 | return 0; | 5194 | return 0; |
| 5187 | } | 5195 | } |
| 5188 | 5196 | ||
| 5189 | static int raid5_check_reshape(mddev_t *mddev) | 5197 | static int check_stripe_cache(mddev_t *mddev) |
| 5198 | { | ||
| 5199 | /* Can only proceed if there are plenty of stripe_heads. | ||
| 5200 | * We need a minimum of one full stripe,, and for sensible progress | ||
| 5201 | * it is best to have about 4 times that. | ||
| 5202 | * If we require 4 times, then the default 256 4K stripe_heads will | ||
| 5203 | * allow for chunk sizes up to 256K, which is probably OK. | ||
| 5204 | * If the chunk size is greater, user-space should request more | ||
| 5205 | * stripe_heads first. | ||
| 5206 | */ | ||
| 5207 | raid5_conf_t *conf = mddev->private; | ||
| 5208 | if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 | ||
| 5209 | > conf->max_nr_stripes || | ||
| 5210 | ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 | ||
| 5211 | > conf->max_nr_stripes) { | ||
| 5212 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", | ||
| 5213 | ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) | ||
| 5214 | / STRIPE_SIZE)*4); | ||
| 5215 | return 0; | ||
| 5216 | } | ||
| 5217 | return 1; | ||
| 5218 | } | ||
| 5219 | |||
| 5220 | static int check_reshape(mddev_t *mddev) | ||
| 5190 | { | 5221 | { |
| 5191 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5222 | raid5_conf_t *conf = mddev->private; |
| 5192 | 5223 | ||
| 5193 | if (mddev->delta_disks == 0 && | 5224 | if (mddev->delta_disks == 0 && |
| 5194 | mddev->new_layout == mddev->layout && | 5225 | mddev->new_layout == mddev->layout && |
| 5195 | mddev->new_chunk == mddev->chunk_size) | 5226 | mddev->new_chunk_sectors == mddev->chunk_sectors) |
| 5196 | return -EINVAL; /* nothing to do */ | 5227 | return 0; /* nothing to do */ |
| 5197 | if (mddev->bitmap) | 5228 | if (mddev->bitmap) |
| 5198 | /* Cannot grow a bitmap yet */ | 5229 | /* Cannot grow a bitmap yet */ |
| 5199 | return -EBUSY; | 5230 | return -EBUSY; |
| @@ -5212,28 +5243,15 @@ static int raid5_check_reshape(mddev_t *mddev) | |||
| 5212 | return -EINVAL; | 5243 | return -EINVAL; |
| 5213 | } | 5244 | } |
| 5214 | 5245 | ||
| 5215 | /* Can only proceed if there are plenty of stripe_heads. | 5246 | if (!check_stripe_cache(mddev)) |
| 5216 | * We need a minimum of one full stripe,, and for sensible progress | ||
| 5217 | * it is best to have about 4 times that. | ||
| 5218 | * If we require 4 times, then the default 256 4K stripe_heads will | ||
| 5219 | * allow for chunk sizes up to 256K, which is probably OK. | ||
| 5220 | * If the chunk size is greater, user-space should request more | ||
| 5221 | * stripe_heads first. | ||
| 5222 | */ | ||
| 5223 | if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || | ||
| 5224 | (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { | ||
| 5225 | printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", | ||
| 5226 | (max(mddev->chunk_size, mddev->new_chunk) | ||
| 5227 | / STRIPE_SIZE)*4); | ||
| 5228 | return -ENOSPC; | 5247 | return -ENOSPC; |
| 5229 | } | ||
| 5230 | 5248 | ||
| 5231 | return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); | 5249 | return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); |
| 5232 | } | 5250 | } |
| 5233 | 5251 | ||
| 5234 | static int raid5_start_reshape(mddev_t *mddev) | 5252 | static int raid5_start_reshape(mddev_t *mddev) |
| 5235 | { | 5253 | { |
| 5236 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5254 | raid5_conf_t *conf = mddev->private; |
| 5237 | mdk_rdev_t *rdev; | 5255 | mdk_rdev_t *rdev; |
| 5238 | int spares = 0; | 5256 | int spares = 0; |
| 5239 | int added_devices = 0; | 5257 | int added_devices = 0; |
| @@ -5242,6 +5260,9 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 5242 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 5260 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
| 5243 | return -EBUSY; | 5261 | return -EBUSY; |
| 5244 | 5262 | ||
| 5263 | if (!check_stripe_cache(mddev)) | ||
| 5264 | return -ENOSPC; | ||
| 5265 | |||
| 5245 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5266 | list_for_each_entry(rdev, &mddev->disks, same_set) |
| 5246 | if (rdev->raid_disk < 0 && | 5267 | if (rdev->raid_disk < 0 && |
| 5247 | !test_bit(Faulty, &rdev->flags)) | 5268 | !test_bit(Faulty, &rdev->flags)) |
| @@ -5268,8 +5289,8 @@ static int raid5_start_reshape(mddev_t *mddev) | |||
| 5268 | spin_lock_irq(&conf->device_lock); | 5289 | spin_lock_irq(&conf->device_lock); |
| 5269 | conf->previous_raid_disks = conf->raid_disks; | 5290 | conf->previous_raid_disks = conf->raid_disks; |
| 5270 | conf->raid_disks += mddev->delta_disks; | 5291 | conf->raid_disks += mddev->delta_disks; |
| 5271 | conf->prev_chunk = conf->chunk_size; | 5292 | conf->prev_chunk_sectors = conf->chunk_sectors; |
| 5272 | conf->chunk_size = mddev->new_chunk; | 5293 | conf->chunk_sectors = mddev->new_chunk_sectors; |
| 5273 | conf->prev_algo = conf->algorithm; | 5294 | conf->prev_algo = conf->algorithm; |
| 5274 | conf->algorithm = mddev->new_layout; | 5295 | conf->algorithm = mddev->new_layout; |
| 5275 | if (mddev->delta_disks < 0) | 5296 | if (mddev->delta_disks < 0) |
| @@ -5351,7 +5372,7 @@ static void end_reshape(raid5_conf_t *conf) | |||
| 5351 | */ | 5372 | */ |
| 5352 | { | 5373 | { |
| 5353 | int data_disks = conf->raid_disks - conf->max_degraded; | 5374 | int data_disks = conf->raid_disks - conf->max_degraded; |
| 5354 | int stripe = data_disks * (conf->chunk_size | 5375 | int stripe = data_disks * ((conf->chunk_sectors << 9) |
| 5355 | / PAGE_SIZE); | 5376 | / PAGE_SIZE); |
| 5356 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | 5377 | if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
| 5357 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | 5378 | conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
| @@ -5365,7 +5386,7 @@ static void end_reshape(raid5_conf_t *conf) | |||
| 5365 | static void raid5_finish_reshape(mddev_t *mddev) | 5386 | static void raid5_finish_reshape(mddev_t *mddev) |
| 5366 | { | 5387 | { |
| 5367 | struct block_device *bdev; | 5388 | struct block_device *bdev; |
| 5368 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5389 | raid5_conf_t *conf = mddev->private; |
| 5369 | 5390 | ||
| 5370 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { | 5391 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { |
| 5371 | 5392 | ||
| @@ -5396,7 +5417,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
| 5396 | raid5_remove_disk(mddev, d); | 5417 | raid5_remove_disk(mddev, d); |
| 5397 | } | 5418 | } |
| 5398 | mddev->layout = conf->algorithm; | 5419 | mddev->layout = conf->algorithm; |
| 5399 | mddev->chunk_size = conf->chunk_size; | 5420 | mddev->chunk_sectors = conf->chunk_sectors; |
| 5400 | mddev->reshape_position = MaxSector; | 5421 | mddev->reshape_position = MaxSector; |
| 5401 | mddev->delta_disks = 0; | 5422 | mddev->delta_disks = 0; |
| 5402 | } | 5423 | } |
| @@ -5404,7 +5425,7 @@ static void raid5_finish_reshape(mddev_t *mddev) | |||
| 5404 | 5425 | ||
| 5405 | static void raid5_quiesce(mddev_t *mddev, int state) | 5426 | static void raid5_quiesce(mddev_t *mddev, int state) |
| 5406 | { | 5427 | { |
| 5407 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5428 | raid5_conf_t *conf = mddev->private; |
| 5408 | 5429 | ||
| 5409 | switch(state) { | 5430 | switch(state) { |
| 5410 | case 2: /* resume for a suspend */ | 5431 | case 2: /* resume for a suspend */ |
| @@ -5454,7 +5475,7 @@ static void *raid5_takeover_raid1(mddev_t *mddev) | |||
| 5454 | 5475 | ||
| 5455 | mddev->new_level = 5; | 5476 | mddev->new_level = 5; |
| 5456 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; | 5477 | mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; |
| 5457 | mddev->new_chunk = chunksect << 9; | 5478 | mddev->new_chunk_sectors = chunksect; |
| 5458 | 5479 | ||
| 5459 | return setup_conf(mddev); | 5480 | return setup_conf(mddev); |
| 5460 | } | 5481 | } |
| @@ -5493,24 +5514,24 @@ static void *raid5_takeover_raid6(mddev_t *mddev) | |||
| 5493 | } | 5514 | } |
| 5494 | 5515 | ||
| 5495 | 5516 | ||
| 5496 | static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | 5517 | static int raid5_check_reshape(mddev_t *mddev) |
| 5497 | { | 5518 | { |
| 5498 | /* For a 2-drive array, the layout and chunk size can be changed | 5519 | /* For a 2-drive array, the layout and chunk size can be changed |
| 5499 | * immediately as not restriping is needed. | 5520 | * immediately as not restriping is needed. |
| 5500 | * For larger arrays we record the new value - after validation | 5521 | * For larger arrays we record the new value - after validation |
| 5501 | * to be used by a reshape pass. | 5522 | * to be used by a reshape pass. |
| 5502 | */ | 5523 | */ |
| 5503 | raid5_conf_t *conf = mddev_to_conf(mddev); | 5524 | raid5_conf_t *conf = mddev->private; |
| 5525 | int new_chunk = mddev->new_chunk_sectors; | ||
| 5504 | 5526 | ||
| 5505 | if (new_layout >= 0 && !algorithm_valid_raid5(new_layout)) | 5527 | if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) |
| 5506 | return -EINVAL; | 5528 | return -EINVAL; |
| 5507 | if (new_chunk > 0) { | 5529 | if (new_chunk > 0) { |
| 5508 | if (new_chunk & (new_chunk-1)) | 5530 | if (!is_power_of_2(new_chunk)) |
| 5509 | /* not a power of 2 */ | ||
| 5510 | return -EINVAL; | 5531 | return -EINVAL; |
| 5511 | if (new_chunk < PAGE_SIZE) | 5532 | if (new_chunk < (PAGE_SIZE>>9)) |
| 5512 | return -EINVAL; | 5533 | return -EINVAL; |
| 5513 | if (mddev->array_sectors & ((new_chunk>>9)-1)) | 5534 | if (mddev->array_sectors & (new_chunk-1)) |
| 5514 | /* not factor of array size */ | 5535 | /* not factor of array size */ |
| 5515 | return -EINVAL; | 5536 | return -EINVAL; |
| 5516 | } | 5537 | } |
| @@ -5518,49 +5539,39 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | |||
| 5518 | /* They look valid */ | 5539 | /* They look valid */ |
| 5519 | 5540 | ||
| 5520 | if (mddev->raid_disks == 2) { | 5541 | if (mddev->raid_disks == 2) { |
| 5521 | 5542 | /* can make the change immediately */ | |
| 5522 | if (new_layout >= 0) { | 5543 | if (mddev->new_layout >= 0) { |
| 5523 | conf->algorithm = new_layout; | 5544 | conf->algorithm = mddev->new_layout; |
| 5524 | mddev->layout = mddev->new_layout = new_layout; | 5545 | mddev->layout = mddev->new_layout; |
| 5525 | } | 5546 | } |
| 5526 | if (new_chunk > 0) { | 5547 | if (new_chunk > 0) { |
| 5527 | conf->chunk_size = new_chunk; | 5548 | conf->chunk_sectors = new_chunk ; |
| 5528 | mddev->chunk_size = mddev->new_chunk = new_chunk; | 5549 | mddev->chunk_sectors = new_chunk; |
| 5529 | } | 5550 | } |
| 5530 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 5551 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| 5531 | md_wakeup_thread(mddev->thread); | 5552 | md_wakeup_thread(mddev->thread); |
| 5532 | } else { | ||
| 5533 | if (new_layout >= 0) | ||
| 5534 | mddev->new_layout = new_layout; | ||
| 5535 | if (new_chunk > 0) | ||
| 5536 | mddev->new_chunk = new_chunk; | ||
| 5537 | } | 5553 | } |
| 5538 | return 0; | 5554 | return check_reshape(mddev); |
| 5539 | } | 5555 | } |
| 5540 | 5556 | ||
| 5541 | static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk) | 5557 | static int raid6_check_reshape(mddev_t *mddev) |
| 5542 | { | 5558 | { |
| 5543 | if (new_layout >= 0 && !algorithm_valid_raid6(new_layout)) | 5559 | int new_chunk = mddev->new_chunk_sectors; |
| 5560 | |||
| 5561 | if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) | ||
| 5544 | return -EINVAL; | 5562 | return -EINVAL; |
| 5545 | if (new_chunk > 0) { | 5563 | if (new_chunk > 0) { |
| 5546 | if (new_chunk & (new_chunk-1)) | 5564 | if (!is_power_of_2(new_chunk)) |
| 5547 | /* not a power of 2 */ | ||
| 5548 | return -EINVAL; | 5565 | return -EINVAL; |
| 5549 | if (new_chunk < PAGE_SIZE) | 5566 | if (new_chunk < (PAGE_SIZE >> 9)) |
| 5550 | return -EINVAL; | 5567 | return -EINVAL; |
| 5551 | if (mddev->array_sectors & ((new_chunk>>9)-1)) | 5568 | if (mddev->array_sectors & (new_chunk-1)) |
| 5552 | /* not factor of array size */ | 5569 | /* not factor of array size */ |
| 5553 | return -EINVAL; | 5570 | return -EINVAL; |
| 5554 | } | 5571 | } |
| 5555 | 5572 | ||
| 5556 | /* They look valid */ | 5573 | /* They look valid */ |
| 5557 | 5574 | return check_reshape(mddev); | |
| 5558 | if (new_layout >= 0) | ||
| 5559 | mddev->new_layout = new_layout; | ||
| 5560 | if (new_chunk > 0) | ||
| 5561 | mddev->new_chunk = new_chunk; | ||
| 5562 | |||
| 5563 | return 0; | ||
| 5564 | } | 5575 | } |
| 5565 | 5576 | ||
| 5566 | static void *raid5_takeover(mddev_t *mddev) | 5577 | static void *raid5_takeover(mddev_t *mddev) |
| @@ -5570,8 +5581,6 @@ static void *raid5_takeover(mddev_t *mddev) | |||
| 5570 | * raid1 - if there are two drives. We need to know the chunk size | 5581 | * raid1 - if there are two drives. We need to know the chunk size |
| 5571 | * raid4 - trivial - just use a raid4 layout. | 5582 | * raid4 - trivial - just use a raid4 layout. |
| 5572 | * raid6 - Providing it is a *_6 layout | 5583 | * raid6 - Providing it is a *_6 layout |
| 5573 | * | ||
| 5574 | * For now, just do raid1 | ||
| 5575 | */ | 5584 | */ |
| 5576 | 5585 | ||
| 5577 | if (mddev->level == 1) | 5586 | if (mddev->level == 1) |
| @@ -5653,12 +5662,11 @@ static struct mdk_personality raid6_personality = | |||
| 5653 | .sync_request = sync_request, | 5662 | .sync_request = sync_request, |
| 5654 | .resize = raid5_resize, | 5663 | .resize = raid5_resize, |
| 5655 | .size = raid5_size, | 5664 | .size = raid5_size, |
| 5656 | .check_reshape = raid5_check_reshape, | 5665 | .check_reshape = raid6_check_reshape, |
| 5657 | .start_reshape = raid5_start_reshape, | 5666 | .start_reshape = raid5_start_reshape, |
| 5658 | .finish_reshape = raid5_finish_reshape, | 5667 | .finish_reshape = raid5_finish_reshape, |
| 5659 | .quiesce = raid5_quiesce, | 5668 | .quiesce = raid5_quiesce, |
| 5660 | .takeover = raid6_takeover, | 5669 | .takeover = raid6_takeover, |
| 5661 | .reconfig = raid6_reconfig, | ||
| 5662 | }; | 5670 | }; |
| 5663 | static struct mdk_personality raid5_personality = | 5671 | static struct mdk_personality raid5_personality = |
| 5664 | { | 5672 | { |
| @@ -5681,7 +5689,6 @@ static struct mdk_personality raid5_personality = | |||
| 5681 | .finish_reshape = raid5_finish_reshape, | 5689 | .finish_reshape = raid5_finish_reshape, |
| 5682 | .quiesce = raid5_quiesce, | 5690 | .quiesce = raid5_quiesce, |
| 5683 | .takeover = raid5_takeover, | 5691 | .takeover = raid5_takeover, |
| 5684 | .reconfig = raid5_reconfig, | ||
| 5685 | }; | 5692 | }; |
| 5686 | 5693 | ||
| 5687 | static struct mdk_personality raid4_personality = | 5694 | static struct mdk_personality raid4_personality = |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 116d0b44b2a9..2390e0e83daf 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -337,7 +337,8 @@ struct raid5_private_data { | |||
| 337 | struct hlist_head *stripe_hashtbl; | 337 | struct hlist_head *stripe_hashtbl; |
| 338 | mddev_t *mddev; | 338 | mddev_t *mddev; |
| 339 | struct disk_info *spare; | 339 | struct disk_info *spare; |
| 340 | int chunk_size, level, algorithm; | 340 | int chunk_sectors; |
| 341 | int level, algorithm; | ||
| 341 | int max_degraded; | 342 | int max_degraded; |
| 342 | int raid_disks; | 343 | int raid_disks; |
| 343 | int max_nr_stripes; | 344 | int max_nr_stripes; |
| @@ -353,7 +354,8 @@ struct raid5_private_data { | |||
| 353 | */ | 354 | */ |
| 354 | sector_t reshape_safe; | 355 | sector_t reshape_safe; |
| 355 | int previous_raid_disks; | 356 | int previous_raid_disks; |
| 356 | int prev_chunk, prev_algo; | 357 | int prev_chunk_sectors; |
| 358 | int prev_algo; | ||
| 357 | short generation; /* increments with every reshape */ | 359 | short generation; /* increments with every reshape */ |
| 358 | unsigned long reshape_checkpoint; /* Time we last updated | 360 | unsigned long reshape_checkpoint; /* Time we last updated |
| 359 | * metadata */ | 361 | * metadata */ |
| @@ -424,8 +426,6 @@ struct raid5_private_data { | |||
| 424 | 426 | ||
| 425 | typedef struct raid5_private_data raid5_conf_t; | 427 | typedef struct raid5_private_data raid5_conf_t; |
| 426 | 428 | ||
| 427 | #define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) | ||
| 428 | |||
| 429 | /* | 429 | /* |
| 430 | * Our supported algorithms | 430 | * Our supported algorithms |
| 431 | */ | 431 | */ |
