35 files changed, 3993 insertions, 435 deletions
diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt
new file mode 100644
index 000000000000..994dd75475a6
--- /dev/null
+++ b/Documentation/device-mapper/dm-log.txt
@@ -0,0 +1,54 @@
+Device-Mapper Logging
+=====================
+The device-mapper logging code is used by some of the device-mapper
+RAID targets to track regions of the disk that are not consistent.
+A region (or portion of the address space) of the disk may be
+inconsistent because a RAID stripe is currently being operated on or
+a machine died while the region was being altered.  In the case of
+mirrors, a region would be considered dirty/inconsistent while you
+are writing to it because the writes need to be replicated for all
+the legs of the mirror and may not reach the legs at the same time.
+Once all writes are complete, the region is considered clean again.
+There is a generic logging interface that the device-mapper RAID
+implementations use to perform logging operations (see
+dm_dirty_log_type in include/linux/dm-dirty-log.h).  Various different
+logging implementations are available and provide different
+capabilities.  The list includes:
+Type            Files
+====            =====
+disk            drivers/md/dm-log.c
+core            drivers/md/dm-log.c
+userspace       drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h
+The "disk" log type
+-------------------
+This log implementation commits the log state to disk.  This way, the
+logging state survives reboots/crashes.
+The "core" log type
+-------------------
+This log implementation keeps the log state in memory.  The log state
+will not survive a reboot or crash, but there may be a small boost in
+performance.  This method can also be used if no storage device is
+available for storing log state.
+The "userspace" log type
+------------------------
+This log type simply provides a way to export the log API to userspace,
+so log implementations can be done there.  This is done by forwarding most
+logging requests to userspace, where a daemon receives and processes the
+request.
+The structures used for communication between kernel and userspace are
+located in include/linux/dm-log-userspace.h.  Due to the frequency,
+diversity, and 2-way communication nature of the exchanges between
+kernel and userspace, 'connector' is used as the interface for
+communication.
+There are currently two userspace log implementations that leverage this
+framework - "clustered_disk" and "clustered_core".  These implementations
+provide a cluster-coherent log for shared-storage.  Device-mapper mirroring
+can be used in a shared-storage environment when the cluster log implementations
+are employed.
diff --git a/Documentation/device-mapper/dm-queue-length.txt b/Documentation/device-mapper/dm-queue-length.txt
new file mode 100644
index 000000000000..f4db2562175c
--- /dev/null
+++ b/Documentation/device-mapper/dm-queue-length.txt
@@ -0,0 +1,39 @@
+dm-queue-length
+===============
+dm-queue-length is a path selector module for device-mapper targets,
+which selects a path with the least number of in-flight I/Os.
+The path selector name is 'queue-length'.
+Table parameters for each path: [<repeat_count>]
+        <repeat_count>: The number of I/Os to dispatch using the selected
+                        path before switching to the next path.
+                        If not given, internal default is used. To check
+                        the default value, see the activated table.
+Status for each path: <status> <fail-count> <in-flight>
+        <status>: 'A' if the path is active, 'F' if the path is failed.
+        <fail-count>: The number of path failures.
+        <in-flight>: The number of in-flight I/Os on the path.
+Algorithm
+=========
+dm-queue-length increments/decrements 'in-flight' when an I/O is
+dispatched/completed respectively.
+dm-queue-length selects a path with the minimum 'in-flight'.
+Examples
+========
+In case that 2 paths (sda and sdb) are used with repeat_count == 128.
+# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
+  | dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0
diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.txt
new file mode 100644
index 000000000000..7d00668e97bb
--- /dev/null
+++ b/Documentation/device-mapper/dm-service-time.txt
@@ -0,0 +1,91 @@
+dm-service-time
+===============
+dm-service-time is a path selector module for device-mapper targets,
+which selects a path with the shortest estimated service time for
+the incoming I/O.
+The service time for each path is estimated by dividing the total size
+of in-flight I/Os on a path by the performance value of the path.
+The performance value is a relative throughput value among all paths
+in a path-group, and it can be specified as a table argument.
+The path selector name is 'service-time'.
+Table parameters for each path: [<repeat_count> [<relative_throughput>]]
+        <repeat_count>: The number of I/Os to dispatch using the selected
+                        path before switching to the next path.
+                        If not given, internal default is used.  To check
+                        the default value, see the activated table.
+        <relative_throughput>: The relative throughput value of the path
+                        among all paths in the path-group.
+                        The valid range is 0-100.
+                        If not given, minimum value '1' is used.
+                        If '0' is given, the path isn't selected while
+                        other paths having a positive value are available.
+Status for each path: <status> <fail-count> <in-flight-size> \
+                      <relative_throughput>
+        <status>: 'A' if the path is active, 'F' if the path is failed.
+        <fail-count>: The number of path failures.
+        <in-flight-size>: The size of in-flight I/Os on the path.
+        <relative_throughput>: The relative throughput value of the path
+                        among all paths in the path-group.
+Algorithm
+=========
+dm-service-time adds the I/O size to 'in-flight-size' when the I/O is
+dispatched and subtracts it when completed.
+Basically, dm-service-time selects a path having minimum service time
+which is calculated by:
+        ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput'
+However, some optimizations below are used to reduce the calculation
+as much as possible.
+        1. If the paths have the same 'relative_throughput', skip
+           the division and just compare the 'in-flight-size'.
+        2. If the paths have the same 'in-flight-size', skip the division
+           and just compare the 'relative_throughput'.
+        3. If some paths have non-zero 'relative_throughput' and others
+           have zero 'relative_throughput', ignore those paths with zero
+           'relative_throughput'.
+If such optimizations can't be applied, calculate service time, and
+compare service time.
+If calculated service time is equal, the path having maximum
+'relative_throughput' may be better.  So compare 'relative_throughput'
+then.
+Examples
+========
+In case that 2 paths (sda and sdb) are used with repeat_count == 128
+and sda has an average throughput 1GB/s and sdb has 4GB/s,
+'relative_throughput' value may be '1' for sda and '4' for sdb.
+# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
+  | dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4
+Or '2' for sda and '8' for sdb would also be valid.
+# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \
+  | dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 36e0675be9f7..020f9573fd82 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -231,6 +231,17 @@ config DM_MIRROR
         Allow volume managers to mirror logical volumes, also
         needed for live data migration tools such as 'pvmove'.
+config DM_LOG_USERSPACE
+        tristate "Mirror userspace logging (EXPERIMENTAL)"
+        depends on DM_MIRROR && EXPERIMENTAL && NET
+        select CONNECTOR
+        ---help---
+          The userspace logging module provides a mechanism for
+          relaying the dm-dirty-log API to userspace.  Log designs
+          which are more suited to userspace implementation (e.g.
+          shared storage logs) or experimental logs can be implemented
+          by leveraging this framework.
 config DM_ZERO
        tristate "Zero target"
        depends on BLK_DEV_DM
@@ -249,6 +260,25 @@ config DM_MULTIPATH
        ---help---
          Allow volume managers to support multipath hardware.
+config DM_MULTIPATH_QL
+        tristate "I/O Path Selector based on the number of in-flight I/Os"
+        depends on DM_MULTIPATH
+        ---help---
+          This path selector is a dynamic load balancer which selects
+          the path with the least number of in-flight I/Os.
+          If unsure, say N.
+config DM_MULTIPATH_ST
+        tristate "I/O Path Selector based on the service time"
+        depends on DM_MULTIPATH
+        ---help---
+          This path selector is a dynamic load balancer which selects
+          the path expected to complete the incoming I/O in the shortest
+          time.
+          If unsure, say N.
 config DM_DELAY
        tristate "I/O delaying target (EXPERIMENTAL)"
        depends on BLK_DEV_DM && EXPERIMENTAL
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 45cc5951d928..1dc4185bd781 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,6 +8,8 @@ dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y   += dm-snap.o dm-exception-store.o dm-snap-transient.o \
                    dm-snap-persistent.o
 dm-mirror-y     += dm-raid1.o
+dm-log-userspace-y \
+                += dm-log-userspace-base.o dm-log-userspace-transfer.o
 md-mod-y        += md.o bitmap.o
 raid456-y       += raid5.o
 raid6_pq-y      += raid6algos.o raid6recov.o raid6tables.o \
@@ -36,8 +38,11 @@ obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_CRYPT)          += dm-crypt.o
 obj-$(CONFIG_DM_DELAY)          += dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH)      += dm-multipath.o dm-round-robin.o
+obj-$(CONFIG_DM_MULTIPATH_QL)   += dm-queue-length.o
+obj-$(CONFIG_DM_MULTIPATH_ST)   += dm-service-time.o
 obj-$(CONFIG_DM_SNAPSHOT)       += dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)         += dm-mirror.o dm-log.o dm-region-hash.o
+obj-$(CONFIG_DM_LOG_USERSPACE)  += dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)           += dm-zero.o
 quiet_cmd_unroll = UNROLL  $@
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 53394e863c74..9933eb861c71 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                goto bad_crypt_queue;
        }
+        ti->num_flush_requests = 1;
        ti->private = cc;
        return 0;
@@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
                     union map_info *map_context)
 {
        struct dm_crypt_io *io;
+        struct crypt_config *cc;
+        if (unlikely(bio_empty_barrier(bio))) {
+                cc = ti->private;
+                bio->bi_bdev = cc->dev->bdev;
+                return DM_MAPIO_REMAPPED;
+        }
        io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin);
@@ -1305,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
        return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
+/*
+ * Hand the crypt target's one underlying device, together with its
+ * 'start' value, to the callout 'fn' and pass back whatever 'fn' returns.
+ */
+static int crypt_iterate_devices(struct dm_target *ti,
+                                 iterate_devices_callout_fn fn, void *data)
+{
+        struct crypt_config *config = ti->private;
+        int ret;
+        ret = fn(ti, config->dev, config->start, data);
+        return ret;
+}
 static struct target_type crypt_target = {
        .name   = "crypt",
-        .version= {1, 6, 0},
+        .version = {1, 7, 0},
        .module = THIS_MODULE,
        .ctr    = crypt_ctr,
        .dtr    = crypt_dtr,
@@ -1318,6 +1334,7 @@ static struct target_type crypt_target = {
        .resume = crypt_resume,
        .message = crypt_message,
        .merge  = crypt_merge,
+        .iterate_devices = crypt_iterate_devices,
 };
 static int __init dm_crypt_init(void)
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 559dbb52bc85..4e5b843cd4d7 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -197,6 +197,7 @@ out:
        mutex_init(&dc->timer_lock);
        atomic_set(&dc->may_delay, 1);
+        ti->num_flush_requests = 1;
        ti->private = dc;
        return 0;
@@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio,
        if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
                bio->bi_bdev = dc->dev_write->bdev;
-                bio->bi_sector = dc->start_write +
+                if (bio_sectors(bio))
-                                 (bio->bi_sector - ti->begin);
+                        bio->bi_sector = dc->start_write +
+                                         (bio->bi_sector - ti->begin);
                return delay_bio(dc, dc->write_delay, bio);
        }
@@ -316,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type,
        return 0;
 }
+/*
+ * Invoke the callout 'fn' on the read device and then, if that call
+ * returned 0 and a write device is configured, on the write device.
+ * The first non-zero result from 'fn' is returned; otherwise 0.
+ */
+static int delay_iterate_devices(struct dm_target *ti,
+                                 iterate_devices_callout_fn fn, void *data)
+{
+        struct delay_c *dc = ti->private;
+        int ret;
+        ret = fn(ti, dc->dev_read, dc->start_read, data);
+        if (ret || !dc->dev_write)
+                return ret;
+        return fn(ti, dc->dev_write, dc->start_write, data);
+}
 static struct target_type delay_target = {
        .name        = "delay",
-        .version     = {1, 0, 2},
+        .version     = {1, 1, 0},
        .module      = THIS_MODULE,
        .ctr         = delay_ctr,
        .dtr         = delay_dtr,
@@ -326,6 +345,7 @@ static struct target_type delay_target = {
        .presuspend  = delay_presuspend,
        .resume      = delay_resume,
        .status      = delay_status,
+        .iterate_devices = delay_iterate_devices,
 };
 static int __init dm_delay_init(void)
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 75d8081a9041..c3ae51584b12 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -216,7 +216,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
                return -EINVAL;
        }
-        type = get_type(argv[1]);
+        type = get_type(&persistent);
        if (!type) {
                ti->error = "Exception store type not recognised";
                r = -EINVAL;
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index c92701dc5001..2442c8c07898 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
 */
 static inline sector_t get_dev_size(struct block_device *bdev)
 {
-        return bdev->bd_inode->i_size >> SECTOR_SHIFT;
+        return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
 }
 static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index e73aabd61cd7..3a2e6a2f8bdd 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,6 +22,7 @@ struct dm_io_client {
 /* FIXME: can we shrink this ? */
 struct io {
        unsigned long error_bits;
+        unsigned long eopnotsupp_bits;
        atomic_t count;
        struct task_struct *sleeper;
        struct dm_io_client *client;
@@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio)
 *---------------------------------------------------------------*/
 static void dec_count(struct io *io, unsigned int region, int error)
 {
-        if (error)
+        if (error) {
                set_bit(region, &io->error_bits);
+                if (error == -EOPNOTSUPP)
+                        set_bit(region, &io->eopnotsupp_bits);
+        }
        if (atomic_dec_and_test(&io->count)) {
                if (io->sleeper)
@@ -360,7 +364,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
                return -EIO;
        }
+retry:
        io.error_bits = 0;
+        io.eopnotsupp_bits = 0;
        atomic_set(&io.count, 1); /* see dispatch_io() */
        io.sleeper = current;
        io.client = client;
@@ -377,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
        }
        set_current_state(TASK_RUNNING);
+        if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
+                rw &= ~(1 << BIO_RW_BARRIER);
+                goto retry;
+        }
        if (error_bits)
                *error_bits = io.error_bits;
@@ -397,6 +408,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
        io = mempool_alloc(client->pool, GFP_NOIO);
        io->error_bits = 0;
+        io->eopnotsupp_bits = 0;
        atomic_set(&io->count, 1); /* see dispatch_io() */
        io->sleeper = NULL;
        io->client = client;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1128d3fba797..7f77f18fcafa 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -276,7 +276,7 @@ retry:
        up_write(&_hash_lock);
 }
-static int dm_hash_rename(const char *old, const char *new)
+static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
 {
        char *new_name, *old_name;
        struct hash_cell *hc;
@@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new)
                dm_table_put(table);
        }
-        dm_kobject_uevent(hc->md);
+        dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie);
        dm_put(hc->md);
        up_write(&_hash_lock);
@@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
        __hash_remove(hc);
        up_write(&_hash_lock);
+        dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr);
        dm_put(md);
        param->data_size = 0;
        return 0;
@@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
                return r;
        param->data_size = 0;
-        return dm_hash_rename(param->name, new_name);
+        return dm_hash_rename(param->event_nr, param->name, new_name);
 }
 static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
@@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param)
        if (dm_suspended(md))
                r = dm_resume(md);
-        if (!r)
+        if (!r) {
+                dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
                r = __dev_status(md, param);
+        }
        dm_put(md);
        return r;
@@ -1044,6 +1050,12 @@ static int populate_table(struct dm_table *table,
                next = spec->next;
        }
+        r = dm_table_set_type(table);
+        if (r) {
+                DMWARN("unable to set table type");
+                return r;
+        }
        return dm_table_complete(table);
 }
@@ -1089,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
                goto out;
        }
+        r = dm_table_alloc_md_mempools(t);
+        if (r) {
+                DMWARN("unable to allocate mempools for this table");
+                dm_table_destroy(t);
+                goto out;
+        }
        down_write(&_hash_lock);
        hc = dm_get_mdptr(md);
        if (!hc || hc->md != md) {
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 79fb53e51c70..9184b6deb868 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
                goto bad;
        }
+        ti->num_flush_requests = 1;
        ti->private = lc;
        return 0;
@@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
        struct linear_c *lc = ti->private;
        bio->bi_bdev = lc->dev->bdev;
-        bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
+        if (bio_sectors(bio))
+                bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
 }
 static int linear_map(struct dm_target *ti, struct bio *bio,
@@ -132,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
        return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
+/*
+ * Hand the linear target's one underlying device, together with its
+ * 'start' value, to the callout 'fn' and pass back whatever 'fn' returns.
+ */
+static int linear_iterate_devices(struct dm_target *ti,
+                                  iterate_devices_callout_fn fn, void *data)
+{
+        struct linear_c *linear = ti->private;
+        int ret;
+        ret = fn(ti, linear->dev, linear->start, data);
+        return ret;
+}
 static struct target_type linear_target = {
        .name   = "linear",
-        .version= {1, 0, 3},
+        .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr    = linear_ctr,
        .dtr    = linear_dtr,
@@ -142,6 +152,7 @@ static struct target_type linear_target = {
        .status = linear_status,
        .ioctl  = linear_ioctl,
        .merge  = linear_merge,
+        .iterate_devices = linear_iterate_devices,
 };
 int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
new file mode 100644
index 000000000000..e69b96560997
--- /dev/null
+++ b/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+#include <linux/bio.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+#include "dm-log-userspace-transfer.h"
+/*
+ * One queued log request.  userspace_flush() walks these entries and
+ * forwards each one to the userspace server.
+ */
+struct flush_entry {
+        /* request type handed to userspace_do_request() */
+        int type;
+        /* region sent as the request payload */
+        region_t region;
+        /* linkage used to walk the list of pending entries */
+        struct list_head list;
+};
+/*
+ * Per-log context, allocated in userspace_ctr() and stored in
+ * dm_dirty_log.context.
+ */
+struct log_c {
+        /* target this log belongs to */
+        struct dm_target *ti;
+        /* region size as reported by DM_ULOG_GET_REGION_SIZE */
+        uint32_t region_size;
+        /* ti->len divided by region_size, rounded up */
+        region_t region_count;
+        /* first constructor argument; identifies the log to userspace */
+        char uuid[DM_UUID_LEN];
+        /* constructor string, re-sent with DM_ULOG_CTR after a reconnect */
+        char *usr_argv_str;
+        /* argument count given to userspace_ctr() */
+        uint32_t usr_argc;
+        /*
+         * in_sync_hint gets set when doing is_remote_recovering.  It
+         * represents the first region that needs recovery.  IOW, the
+         * first zero bit of sync_bits.  This can be useful to limit
+         * traffic for calls like is_remote_recovering and get_resync_work,
+         * but take care in its use for anything else.
+         */
+        uint64_t in_sync_hint;
+        /* taken around accesses to flush_list */
+        spinlock_t flush_lock;
+        struct list_head flush_list;  /* only for clear and mark requests */
+};
+/* Pool of struct flush_entry objects (presumably set up at module init). */
+static mempool_t *flush_entry_pool;
+/*
+ * Allocation callback: hand out one flush_entry-sized chunk of memory.
+ * 'pool_data' is not used.
+ */
+static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
+{
+        struct flush_entry *fe;
+        fe = kmalloc(sizeof(*fe), gfp_mask);
+        return fe;
+}
+/*
+ * Release callback matching flush_entry_alloc(): free one element.
+ * 'pool_data' is not used.
+ */
+static void flush_entry_free(void *element, void *pool_data)
+{
+        struct flush_entry *fe = element;
+        kfree(fe);
+}
+/*
+ * userspace_do_request
+ *
+ * Forward one request to the userspace log server through
+ * dm_consult_userspace().  Any result other than -ESRCH is handed
+ * straight back to the caller.  On -ESRCH the function keeps re-sending
+ * DM_ULOG_CTR with the saved constructor string (sleeping between
+ * attempts) until that succeeds, then sends DM_ULOG_RESUME and retries
+ * the original request.
+ *
+ * Returns: the result of dm_consult_userspace() for the request, or
+ * -ESRCH if DM_ULOG_RESUME fails after a reconnect.
+ */
+static int userspace_do_request(struct log_c *lc, const char *uuid,
+                                int request_type, char *data, size_t data_size,
+                                char *rdata, size_t *rdata_size)
+{
+        int r;
+        /*
+         * If the server isn't there, -ESRCH is returned,
+         * and we must keep trying until the server is
+         * restored.
+         */
+retry:
+        r = dm_consult_userspace(uuid, request_type, data,
+                                 data_size, rdata, rdata_size);
+        if (r != -ESRCH)
+                return r;
+        DMERR(" Userspace log server not found.");
+        /* Loop until the server accepts the constructor string again. */
+        while (1) {
+                /* Sleep for up to 2*HZ jiffies between attempts. */
+                set_current_state(TASK_INTERRUPTIBLE);
+                schedule_timeout(2*HZ);
+                DMWARN("Attempting to contact userspace log server...");
+                r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
+                                         strlen(lc->usr_argv_str) + 1,
+                                         NULL, NULL);
+                if (!r)
+                        break;
+        }
+        DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
+        /* The log was re-created on the server; resume it before retrying. */
+        r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
+                                 0, NULL, NULL);
+        if (!r)
+                goto retry;
+        DMERR("Error trying to resume userspace log: %d", r);
+        return -ESRCH;
+}
+/*
+ * build_constructor_string
+ *
+ * Join argv[0..argc-1], each followed by a single space, and append
+ * ti->len in decimal.  The result is the table string sent to the
+ * userspace server with DM_ULOG_CTR.
+ *
+ * On success *ctr_str points to a kzalloc()ed, NUL-terminated string
+ * that the caller must kfree(), and the string length (excluding the
+ * terminating NUL) is returned.  On allocation failure *ctr_str is
+ * NULL and -ENOMEM is returned.
+ */
+static int build_constructor_string(struct dm_target *ti,
+                                    unsigned argc, char **argv,
+                                    char **ctr_str)
+{
+        unsigned i;
+        int str_size;
+        char *str = NULL;
+        *ctr_str = NULL;
+        for (i = 0, str_size = 0; i < argc; i++)
+                str_size += strlen(argv[i]) + 1; /* +1 for space between args */
+        /*
+         * A printed u64 needs up to 20 characters and sprintf() writes a
+         * terminating NUL after it, so reserve 21 bytes for the tail.
+         */
+        str_size += 20 + 1;
+        str = kzalloc(str_size, GFP_KERNEL);
+        if (!str) {
+                DMWARN("Unable to allocate memory for constructor string");
+                return -ENOMEM;
+        }
+        /* From here on str_size tracks the number of characters written. */
+        for (i = 0, str_size = 0; i < argc; i++)
+                str_size += sprintf(str + str_size, "%s ", argv[i]);
+        str_size += sprintf(str + str_size, "%llu",
+                            (unsigned long long)ti->len);
+        *ctr_str = str;
+        return str_size;
+}
+/*
+ * userspace_ctr
+ *
+ * argv contains:
+ *      <UUID> <other args>
+ * Where 'other args' is the userspace implementation specific log
+ * arguments.  An example might be:
+ *      <UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
+ *
+ * So, this module will strip off the <UUID> for identification purposes
+ * when communicating with userspace about a log; but will pass on everything
+ * else.
+ *
+ * On success the new log context is stored in log->context and owns the
+ * constructor string, which is kept for re-sending after a reconnect.
+ *
+ * Returns: 0 on success, non-zero on failure
+ */
+static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
+                         unsigned argc, char **argv)
+{
+        int r = 0;
+        int str_size;
+        char *ctr_str = NULL;
+        struct log_c *lc = NULL;
+        uint64_t rdata = 0;
+        size_t rdata_size;
+        if (argc < 3) {
+                DMWARN("Too few arguments to userspace dirty log");
+                return -EINVAL;
+        }
+        /* Zeroed so that in_sync_hint etc. never hold stale heap contents */
+        lc = kzalloc(sizeof(*lc), GFP_KERNEL);
+        if (!lc) {
+                DMWARN("Unable to allocate userspace log context.");
+                return -ENOMEM;
+        }
+        lc->ti = ti;
+        if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
+                DMWARN("UUID argument too long.");
+                kfree(lc);
+                return -EINVAL;
+        }
+        /* Length was checked above, so the copy is always NUL-terminated */
+        strncpy(lc->uuid, argv[0], DM_UUID_LEN);
+        spin_lock_init(&lc->flush_lock);
+        INIT_LIST_HEAD(&lc->flush_list);
+        /* Skip the UUID; everything else is passed on to userspace */
+        str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
+        if (str_size < 0) {
+                kfree(lc);
+                return str_size;
+        }
+        /* Send table string */
+        r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
+                                 ctr_str, str_size, NULL, NULL);
+        if (r < 0) {
+                /* Do not let a later request overwrite a CTR failure */
+                if (r == -ESRCH)
+                        DMERR("Userspace log server not found");
+                else
+                        DMERR("Userspace log server failed to create log");
+                goto out;
+        }
+        /* Since the region size does not change, get it now */
+        rdata_size = sizeof(rdata);
+        r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
+                                 NULL, 0, (char *)&rdata, &rdata_size);
+        if (r) {
+                DMERR("Failed to get region size of dirty log");
+                goto out;
+        }
+        lc->region_size = (uint32_t)rdata;
+        if (!lc->region_size) {
+                /* A zero region size would be used as a divisor below */
+                DMERR("Userspace log server supplied an invalid region size");
+                r = -EINVAL;
+                goto out;
+        }
+        lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
+out:
+        if (r) {
+                kfree(lc);
+                kfree(ctr_str);
+        } else {
+                lc->usr_argv_str = ctr_str;
+                lc->usr_argc = argc;
+                log->context = lc;
+        }
+        return r;
+}
+/*
+ * userspace_dtr
+ *
+ * Tell the userspace server that the log is going away (DM_ULOG_DTR),
+ * then free the saved constructor string and the log context.
+ */
+static void userspace_dtr(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+        /*
+         * The result is deliberately ignored: a destructor cannot report
+         * failure and the local state is released regardless.
+         */
+        dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
+                             NULL, 0,
+                             NULL, NULL);
+        kfree(lc->usr_argv_str);
+        kfree(lc);
+}
+/*
+ * userspace_presuspend
+ *
+ * Relay DM_ULOG_PRESUSPEND to the userspace server.
+ *
+ * Returns: the result of dm_consult_userspace()
+ */
+static int userspace_presuspend(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+        return dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
+                                    NULL, 0, NULL, NULL);
+}
+/*
+ * userspace_postsuspend
+ *
+ * Relay DM_ULOG_POSTSUSPEND to the userspace server.
+ *
+ * Returns: the result of dm_consult_userspace()
+ */
+static int userspace_postsuspend(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+        return dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
+                                    NULL, 0, NULL, NULL);
+}
+/*
+ * userspace_resume
+ *
+ * Reset in_sync_hint to 0, then relay DM_ULOG_RESUME to the userspace
+ * server.
+ *
+ * Returns: the result of dm_consult_userspace()
+ */
+static int userspace_resume(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+        lc->in_sync_hint = 0;
+        return dm_consult_userspace(lc->uuid, DM_ULOG_RESUME,
+                                    NULL, 0, NULL, NULL);
+}
+/*
+ * userspace_get_region_size
+ *
+ * Returns: the region size cached in the log context by userspace_ctr()
+ */
+static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+        uint32_t size = lc->region_size;
+        return size;
+}
+/*
+ * userspace_is_clean
+ *
+ * Ask the userspace server (DM_ULOG_IS_CLEAN) whether 'region' is
+ * clean.  Any failure while consulting the server is reported as
+ * "not clean".
+ *
+ * Returns: 1 if clean, 0 otherwise
+ */
+static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
+{
+        struct log_c *lc = log->context;
+        uint64_t region64 = (uint64_t)region;
+        int64_t is_clean;
+        size_t rdata_size = sizeof(is_clean);
+        int r;
+        r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
+                                 (char *)&region64, sizeof(region64),
+                                 (char *)&is_clean, &rdata_size);
+        if (r)
+                return 0;
+        return (int)is_clean;
+}
+/*
+ * userspace_in_sync
+ *
+ * Ask the userspace server (DM_ULOG_IN_SYNC) whether 'region' is
+ * in-sync.  Any failure while consulting the server is reported as
+ * "not in sync".
+ *
+ * If 'can_block' is NOT set, return -EWOULDBLOCK immediately without
+ * contacting the server.
+ *
+ * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
+ */
+static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
+                             int can_block)
+{
+        struct log_c *lc = log->context;
+        uint64_t region64 = region;
+        int64_t in_sync;
+        size_t rdata_size = sizeof(in_sync);
+        int r;
+        /*
+         * We can never respond directly - even if in_sync_hint is
+         * set.  This is because another machine could see a device
+         * failure and mark the region out-of-sync.  If we don't go
+         * to userspace to ask, we might think the region is in-sync
+         * and allow a read to pick up data that is stale.  (This is
+         * very unlikely if a device actually fails; but it is very
+         * likely if a connection to one device from one machine fails.)
+         *
+         * There still might be a problem if the mirror caches the region
+         * state as in-sync... but then this call would not be made.  So,
+         * that is a mirror problem.
+         */
+        if (!can_block)
+                return -EWOULDBLOCK;
+        r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
+                                 (char *)&region64, sizeof(region64),
+                                 (char *)&in_sync, &rdata_size);
+        if (r)
+                return 0;
+        return (int)in_sync;
+}
+/*
+ * userspace_flush
+ *
+ * This function is ok to block.
+ * Flushing is a two-stage operation: every queued mark/clear
+ * request is first forwarded to the server, and then a single
+ * DM_ULOG_FLUSH tells the server to commit them.  Batching the
+ * commit this way lets the server optimise it rather than
+ * committing once per request.
+ *
+ * A possible refinement is a helper thread that pushes requests
+ * to the server ahead of time, leaving flush with a shorter list
+ * and only the final commit to issue.
+ *
+ * Returns: 0 on success, < 0 on failure
+ */
+static int userspace_flush(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+        struct flush_entry *entry, *next;
+        unsigned long irq_flags;
+        LIST_HEAD(pending);
+        int r = 0;
+        /* Detach everything queued so far; new entries go to a fresh list. */
+        spin_lock_irqsave(&lc->flush_lock, irq_flags);
+        list_splice_init(&lc->flush_list, &pending);
+        spin_unlock_irqrestore(&lc->flush_lock, irq_flags);
+        if (list_empty(&pending))
+                return 0;
+        /*
+         * FIXME: Count up requests, group request types,
+         * allocate memory to stick all requests in and
+         * send to server in one go.  Failing the allocation,
+         * do it one by one.
+         */
+        list_for_each_entry(entry, &pending, list) {
+                r = userspace_do_request(lc, lc->uuid, entry->type,
+                                         (char *)&entry->region,
+                                         sizeof(entry->region),
+                                         NULL, NULL);
+                if (r)
+                        break;
+        }
+        /* Only ask for the commit if every individual request went out. */
+        if (!r)
+                r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+                                         NULL, 0, NULL, NULL);
+        /*
+         * The entries are released whether or not we succeeded.
+         * On failure the caller gets the error and knows the log
+         * facility has failed.
+         */
+        list_for_each_entry_safe(entry, next, &pending, list) {
+                list_del(&entry->list);
+                mempool_free(entry, flush_entry_pool);
+        }
+        if (r)
+                dm_table_event(lc->ti->table);
+        return r;
+}
+/*
+ * userspace_mark_region
+ *
+ * Queue a DM_ULOG_MARK_REGION request for 'region' on the log's
+ * flush list; it is sent to the server by userspace_flush.
+ *
+ * This function should avoid blocking unless absolutely required.
+ * (Memory allocation is valid for blocking.)
+ */
+static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
+{
+        struct log_c *lc = log->context;
+        struct flush_entry *entry;
+        unsigned long irq_flags;
+        /* Wait for an allocation, but _never_ fail */
+        entry = mempool_alloc(flush_entry_pool, GFP_NOIO);
+        BUG_ON(!entry);
+        /* The entry is private until it is linked, so fill it unlocked. */
+        entry->type = DM_ULOG_MARK_REGION;
+        entry->region = region;
+        spin_lock_irqsave(&lc->flush_lock, irq_flags);
+        list_add(&entry->list, &lc->flush_list);
+        spin_unlock_irqrestore(&lc->flush_lock, irq_flags);
+}
+/*
+ * userspace_clear_region
+ *
+ * Queue a DM_ULOG_CLEAR_REGION request for 'region' on the log's
+ * flush list.
+ *
+ * This function must not block, so the allocation is atomic and
+ * may fail.  A failed allocation only means the region is not
+ * cleared: the current sync context is unaffected, but the region
+ * will be re-sync'ed on a reload of the mirror even though it is
+ * in-sync.
+ */
+static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
+{
+        struct log_c *lc = log->context;
+        struct flush_entry *entry;
+        unsigned long irq_flags;
+        /*
+         * Skipping the clear on allocation failure is harmless apart
+         * from an unnecessary resync the next time the device is
+         * activated.
+         */
+        entry = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
+        if (!entry) {
+                DMERR("Failed to allocate memory to clear region.");
+                return;
+        }
+        /* The entry is private until it is linked, so fill it unlocked. */
+        entry->type = DM_ULOG_CLEAR_REGION;
+        entry->region = region;
+        spin_lock_irqsave(&lc->flush_lock, irq_flags);
+        list_add(&entry->list, &lc->flush_list);
+        spin_unlock_irqrestore(&lc->flush_lock, irq_flags);
+}
+/*
+ * userspace_get_resync_work
+ *
+ * Get a region that needs recovery.  It is valid to return
+ * an error for this function.
+ *
+ * '*region' is only written when the request to the server
+ * succeeded; on error it is left untouched.
+ *
+ * Returns: 1 if region filled, 0 if no work, <0 on error
+ */
+static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
+{
+        int r;
+        size_t rdata_size;
+        struct log_c *lc = log->context;
+        struct {
+                int64_t i; /* 64-bit for mix arch compatibility */
+                region_t r;
+        } pkg;
+        /* Everything is already reported in-sync: no work to hand out. */
+        if (lc->in_sync_hint >= lc->region_count)
+                return 0;
+        rdata_size = sizeof(pkg);
+        r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
+                                 NULL, 0,
+                                 (char *)&pkg, &rdata_size);
+        /*
+         * 'pkg' is only filled in by a successful reply.  Bail out
+         * before reading it so uninitialized stack contents are never
+         * handed back through '*region'.
+         */
+        if (r)
+                return r;
+        *region = pkg.r;
+        return (int)pkg.i;
+}
+/*
+ * userspace_set_region_sync
+ *
+ * Set the sync status of a given region.  This function
+ * must not fail, so the result of the request to the server
+ * is deliberately ignored.
+ */
+static void userspace_set_region_sync(struct dm_dirty_log *log,
+                                      region_t region, int in_sync)
+{
+        struct log_c *lc = log->context;
+        struct {
+                region_t r;
+                int64_t i;
+        } pkg;
+        pkg.r = region;
+        pkg.i = (int64_t)in_sync;
+        /*
+         * It would be nice to be able to report failures.
+         * However, it is easy enough to detect and resolve.
+         * The (void) cast documents that the result is dropped
+         * on purpose.
+         */
+        (void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
+                                    (char *)&pkg, sizeof(pkg),
+                                    NULL, NULL);
+}
+/*
+ * userspace_get_sync_count
+ *
+ * Fetch the number of in-sync regions from the server.  A failed
+ * request is reported as a sync count of zero.  When the server
+ * reports at least 'region_count' regions, the cached in_sync_hint
+ * is raised to 'region_count'.
+ *
+ * Returns: sync count on success, 0 on failure
+ */
+static region_t userspace_get_sync_count(struct dm_dirty_log *log)
+{
+        struct log_c *lc = log->context;
+        uint64_t sync_count;
+        size_t rdata_size = sizeof(sync_count);
+        if (userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
+                                 NULL, 0,
+                                 (char *)&sync_count, &rdata_size))
+                return 0;
+        /* Fully in-sync: remember it so later queries can short-circuit. */
+        if (sync_count >= lc->region_count)
+                lc->in_sync_hint = lc->region_count;
+        return (region_t)sync_count;
+}
+/*
+ * userspace_status
+ *
+ * STATUSTYPE_INFO:  the status text is supplied by the userspace
+ *                   server; if the server cannot be consulted,
+ *                   "<log name> 1 COM_FAILURE" is emitted instead.
+ * STATUSTYPE_TABLE: the table line is rebuilt from the stored
+ *                   constructor arguments.
+ *
+ * Returns: amount of space consumed
+ */
+static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
+                            char *result, unsigned maxlen)
+{
+        int r = 0;
+        size_t sz = (size_t)maxlen;
+        struct log_c *lc = log->context;
+        switch (status_type) {
+        case STATUSTYPE_INFO:
+                /* 'sz' is value-result: space offered in, bytes used out. */
+                r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
+                                         NULL, 0,
+                                         result, &sz);
+                if (r) {
+                        sz = 0;
+                        DMEMIT("%s 1 COM_FAILURE", log->type->name);
+                }
+                break;
+        case STATUSTYPE_TABLE:
+                sz = 0;
+                DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
+                       lc->uuid, lc->usr_argv_str);
+                break;
+        }
+        /*
+         * Always report what was written.  Returning 0 after emitting
+         * the COM_FAILURE text would tell the caller that nothing was
+         * consumed, so anything it appends next would overwrite the
+         * failure message.
+         */
+        return (int)sz;
+}
+/*
+ * userspace_is_remote_recovering
+ *
+ * Ask the server whether another machine is recovering 'region'.
+ * Queries are rate-limited to one every HZ/4 jiffies; inside that
+ * window we answer "recovering" without asking.  A failed query is
+ * also reported as "recovering".
+ *
+ * NOTE: the rate-limit state is function-static and is therefore
+ * shared by every userspace log instance.
+ *
+ * Returns: 1 if region recovering, 0 otherwise
+ */
+static int userspace_is_remote_recovering(struct dm_dirty_log *log,
+                                          region_t region)
+{
+        int r;
+        uint64_t region64 = region;
+        struct log_c *lc = log->context;
+        static unsigned long limit;
+        static int limit_armed; /* 0 until 'limit' holds a real deadline */
+        struct {
+                int64_t is_recovering;
+                uint64_t in_sync_hint;
+        } pkg;
+        size_t rdata_size = sizeof(pkg);
+        /*
+         * Once the mirror has been reported to be in-sync,
+         * it will never again ask for recovery work.  So,
+         * we can safely say there is not a remote machine
+         * recovering if the device is in-sync.  (in_sync_hint
+         * must be reset at resume time.)
+         */
+        if (region < lc->in_sync_hint)
+                return 0;
+        /*
+         * Compare with time_before() rather than '<' so the check stays
+         * correct when jiffies wraps.  A plain comparison against a
+         * deadline taken just before the wrap would keep returning 1
+         * until jiffies caught up again.  'limit_armed' keeps the
+         * zero-initialized deadline from being mistaken for a future one.
+         */
+        if (limit_armed && time_before(jiffies, limit))
+                return 1;
+        limit = jiffies + (HZ / 4);
+        limit_armed = 1;
+        r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
+                                 (char *)&region64, sizeof(region64),
+                                 (char *)&pkg, &rdata_size);
+        if (r)
+                return 1;
+        /* The reply also carries the server's current in-sync hint. */
+        lc->in_sync_hint = pkg.in_sync_hint;
+        return (int)pkg.is_recovering;
+}
+/* Operations table registered with the dirty-log core as "userspace". */
+static struct dm_dirty_log_type _userspace_type = {
+        .name                 = "userspace",
+        .module               = THIS_MODULE,
+        /* lifecycle */
+        .ctr                  = userspace_ctr,
+        .dtr                  = userspace_dtr,
+        .presuspend           = userspace_presuspend,
+        .postsuspend          = userspace_postsuspend,
+        .resume               = userspace_resume,
+        /* region queries */
+        .get_region_size      = userspace_get_region_size,
+        .is_clean             = userspace_is_clean,
+        .in_sync              = userspace_in_sync,
+        /* region updates */
+        .flush                = userspace_flush,
+        .mark_region          = userspace_mark_region,
+        .clear_region         = userspace_clear_region,
+        /* recovery */
+        .get_resync_work      = userspace_get_resync_work,
+        .set_region_sync      = userspace_set_region_sync,
+        .get_sync_count       = userspace_get_sync_count,
+        .status               = userspace_status,
+        .is_remote_recovering = userspace_is_remote_recovering,
+};
+/*
+ * Module init: create the flush-entry mempool, bring up the transfer
+ * layer, then register the "userspace" dirty log type.  Each failure
+ * unwinds whatever was set up before it.
+ *
+ * Returns: 0 on success, -ENOMEM or the failing step's error otherwise
+ */
+static int __init userspace_dirty_log_init(void)
+{
+        int r;
+        flush_entry_pool = mempool_create(100, flush_entry_alloc,
+                                          flush_entry_free, NULL);
+        if (!flush_entry_pool) {
+                DMWARN("Unable to create flush_entry_pool:  No memory.");
+                return -ENOMEM;
+        }
+        r = dm_ulog_tfr_init();
+        if (r) {
+                DMWARN("Unable to initialize userspace log communications");
+                goto out_destroy_pool;
+        }
+        r = dm_dirty_log_type_register(&_userspace_type);
+        if (r) {
+                DMWARN("Couldn't register userspace dirty log type");
+                goto out_tfr_exit;
+        }
+        DMINFO("version 1.0.0 loaded");
+        return 0;
+out_tfr_exit:
+        dm_ulog_tfr_exit();
+out_destroy_pool:
+        mempool_destroy(flush_entry_pool);
+        return r;
+}
+/*
+ * Module exit: undo userspace_dirty_log_init in reverse order -
+ * unregister the log type, shut down the transfer layer and
+ * release the flush-entry mempool.
+ */
+static void __exit userspace_dirty_log_exit(void)
+{
+        dm_dirty_log_type_unregister(&_userspace_type);
+        dm_ulog_tfr_exit();
+        mempool_destroy(flush_entry_pool);
+        DMINFO("version 1.0.0 unloaded");
+}
+module_init(userspace_dirty_log_init);
+module_exit(userspace_dirty_log_exit);
+MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
+MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
new file mode 100644
index 000000000000..0ca1ee768a1f
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <net/sock.h>
+#include <linux/workqueue.h>
+#include <linux/connector.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+#include "dm-log-userspace-transfer.h"
+static uint32_t dm_ulog_seq;
+/*
+ * Netlink/Connector is an unreliable protocol.  How long should
+ * we wait for a response before assuming it was lost and retrying?
+ * (If we do receive a response after this time, it will be discarded
+ * and the response to the resent request will be waited for.)
+ */
+#define DM_ULOG_RETRY_TIMEOUT (15 * HZ)
+/*
+ * Pre-allocated space for speed
+ */
+#define DM_ULOG_PREALLOCED_SIZE 512
+static struct cn_msg *prealloced_cn_msg;
+static struct dm_ulog_request *prealloced_ulog_tfr;
+static struct cb_id ulog_cn_id = {
+        .idx = CN_IDX_DM,
+        .val = CN_VAL_DM_USERSPACE_LOG
+};
+static DEFINE_MUTEX(dm_ulog_lock);
+/*
+ * Bookkeeping for one request that is waiting for its reply.
+ * dm_consult_userspace puts one of these on 'receiving_list'
+ * and fill_pkg completes it when a message with a matching
+ * sequence number arrives.
+ */
+struct receiving_pkg {
+        struct list_head list;          /* entry in receiving_list */
+        struct completion complete;     /* signalled by fill_pkg */
+        uint32_t seq;                   /* sequence number of the request */
+        int error;                      /* result filled in by fill_pkg */
+        size_t *data_size;              /* in: space in 'data'; out: bytes stored */
+        char *data;                     /* where the reply payload is copied */
+};
+static DEFINE_SPINLOCK(receiving_list_lock);
+static struct list_head receiving_list;
+/*
+ * Fill in the preallocated connector header for 'tfr' and hand the
+ * message to the connector.  The header carries the request's
+ * sequence number and a length covering the request structure plus
+ * its payload.
+ *
+ * Returns: the result of cn_netlink_send
+ */
+static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
+{
+        struct cn_msg *hdr = prealloced_cn_msg;
+        memset(hdr, 0, sizeof(*hdr));
+        hdr->id.idx = ulog_cn_id.idx;
+        hdr->id.val = ulog_cn_id.val;
+        hdr->ack = 0;
+        hdr->seq = tfr->seq;
+        hdr->len = sizeof(*tfr) + tfr->data_size;
+        return cn_netlink_send(hdr, 0, gfp_any());
+}
+/*
+ * Parameters for this function can be either msg or tfr, but not
+ * both.  This function fills in the reply for a waiting request.
+ * If just msg is given, then the reply is simply an ACK from userspace
+ * that the request was received.
+ *
+ * The waiting request is found by sequence number on 'receiving_list'.
+ * The only caller visible here, cn_ulog_callback, holds
+ * 'receiving_list_lock' around the call.
+ *
+ * Returns: 0 on success, -ENOENT on failure
+ */
+static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
+{
+        uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0;
+        struct receiving_pkg *pkg;
+        /*
+         * The 'receiving_pkg' entries in this list are statically
+         * allocated on the stack in 'dm_consult_userspace'.
+         * Each process that is waiting for a reply from the user
+         * space server will have an entry in this list.
+         *
+         * We are safe to do it this way because the stack space
+         * is unique to each process, but still addressable by
+         * other processes.
+         */
+        list_for_each_entry(pkg, &receiving_list, list) {
+                if (rtn_seq != pkg->seq)
+                        continue;
+                if (msg) {
+                        /* Bare ACK: the ack field carries a positive errno. */
+                        pkg->error = -msg->ack;
+                        /*
+                         * If we are trying again, we will need to know our
+                         * storage capacity.  Otherwise, along with the
+                         * error code, we make explicit that we have no data.
+                         */
+                        if (pkg->error != -EAGAIN)
+                                *(pkg->data_size) = 0;
+                } else if (tfr->data_size > *(pkg->data_size)) {
+                        /* %zu: *(pkg->data_size) is a size_t, not a long. */
+                        DMERR("Insufficient space to receive package [%u] "
+                              "(%u vs %zu)", tfr->request_type,
+                              tfr->data_size, *(pkg->data_size));
+                        *(pkg->data_size) = 0;
+                        pkg->error = -ENOSPC;
+                } else {
+                        pkg->error = tfr->error;
+                        memcpy(pkg->data, tfr->data, tfr->data_size);
+                        *(pkg->data_size) = tfr->data_size;
+                }
+                /* Wake the requester sleeping in dm_consult_userspace. */
+                complete(&pkg->complete);
+                return 0;
+        }
+        return -ENOENT;
+}
+/*
+ * Connector callback: receives every message userspace sends us.
+ * A zero-length message is a bare ACK; a message at least as large
+ * as a dm_ulog_request is a full reply.  Anything in between is
+ * logged and dropped.
+ */
+static void cn_ulog_callback(void *data)
+{
+        struct cn_msg *msg = data;
+        struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
+        spin_lock(&receiving_list_lock);
+        if (!msg->len) {
+                fill_pkg(msg, NULL);
+        } else if (msg->len >= sizeof(*tfr)) {
+                fill_pkg(NULL, tfr);
+        } else {
+                DMERR("Incomplete message received (expected %u, got %u): [%u]",
+                      (unsigned)sizeof(*tfr), msg->len, msg->seq);
+        }
+        spin_unlock(&receiving_list_lock);
+}
+/**
+ * dm_consult_userspace
+ * @uuid: log's uuid (must be DM_UUID_LEN in size)
+ * @request_type:  found in include/linux/dm-log-userspace.h
+ * @data: data to tx to the server
+ * @data_size: size of data in bytes
+ * @rdata: place to put return data from server
+ * @rdata_size: value-result (amount of space given/amount of space used)
+ *
+ * rdata_size is undefined on failure.
+ *
+ * Memory used to communicate with userspace is zero'ed
+ * before populating to ensure that no unwanted bits leak
+ * from kernel space to user-space.  All userspace log communications
+ * between kernel and user space go through this function.
+ *
+ * The request is resent if no reply arrives within
+ * DM_ULOG_RETRY_TIMEOUT or if the server answers -EAGAIN.
+ *
+ * Returns: 0 on success, -EXXX on failure
+ **/
+int dm_consult_userspace(const char *uuid, int request_type,
+                         char *data, size_t data_size,
+                         char *rdata, size_t *rdata_size)
+{
+        int r = 0;
+        size_t dummy = 0;
+        /*
+         * Bytes of the preallocated buffer taken by the connector
+         * header and the request header, i.e. not available for
+         * payload.  This must be the size of the request structure
+         * itself, not of a pointer to it, or the check below would
+         * admit payloads that overrun the buffer.
+         */
+        int overhead_size =
+                sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
+        struct dm_ulog_request *tfr = prealloced_ulog_tfr;
+        struct receiving_pkg pkg;
+        if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
+                DMINFO("Size of tfr exceeds preallocated size");
+                return -EINVAL;
+        }
+        /* Callers that expect no reply data may pass a NULL rdata_size. */
+        if (!rdata_size)
+                rdata_size = &dummy;
+resend:
+        /*
+         * We serialize the sending of requests so we can
+         * use the preallocated space.
+         */
+        mutex_lock(&dm_ulog_lock);
+        /* Clear everything behind the connector header: request + payload. */
+        memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
+        memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+        tfr->seq = dm_ulog_seq++;
+        /*
+         * Must be valid request type (all other bits set to
+         * zero).  This reserves other bits for possible future
+         * use.
+         */
+        tfr->request_type = request_type & DM_ULOG_REQUEST_MASK;
+        tfr->data_size = data_size;
+        if (data && data_size)
+                memcpy(tfr->data, data, data_size);
+        memset(&pkg, 0, sizeof(pkg));
+        init_completion(&pkg.complete);
+        pkg.seq = tfr->seq;
+        pkg.data_size = rdata_size;
+        pkg.data = rdata;
+        /* Register for the reply before sending so it cannot be missed. */
+        spin_lock(&receiving_list_lock);
+        list_add(&(pkg.list), &receiving_list);
+        spin_unlock(&receiving_list_lock);
+        r = dm_ulog_sendto_server(tfr);
+        mutex_unlock(&dm_ulog_lock);
+        if (r) {
+                DMERR("Unable to send log request [%u] to userspace: %d",
+                      request_type, r);
+                spin_lock(&receiving_list_lock);
+                list_del_init(&(pkg.list));
+                spin_unlock(&receiving_list_lock);
+                goto out;
+        }
+        r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
+        spin_lock(&receiving_list_lock);
+        list_del_init(&(pkg.list));
+        spin_unlock(&receiving_list_lock);
+        if (!r) {
+                /* Timed out: log the last 8 characters of the uuid and retry. */
+                DMWARN("[%s] Request timed out: [%u/%u] - retrying",
+                       (strlen(uuid) > 8) ?
+                       (uuid + (strlen(uuid) - 8)) : (uuid),
+                       request_type, pkg.seq);
+                goto resend;
+        }
+        r = pkg.error;
+        if (r == -EAGAIN)
+                goto resend;
+out:
+        return r;
+}
+/*
+ * dm_ulog_tfr_init
+ *
+ * Set up the transfer layer: initialise the list of waiting
+ * requests, allocate the buffer shared by the connector header and
+ * the request that follows it, and register the connector callback.
+ *
+ * Returns: 0 on success, -ENOMEM or the cn_add_callback error
+ */
+int dm_ulog_tfr_init(void)
+{
+        int r;
+        void *prealloced;
+        INIT_LIST_HEAD(&receiving_list);
+        prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL);
+        if (!prealloced)
+                return -ENOMEM;
+        /* One buffer: connector header first, request right behind it. */
+        prealloced_cn_msg = prealloced;
+        prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg);
+        r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
+        if (r) {
+                /*
+                 * Registration failed, so there is no callback to
+                 * delete - but the buffer must be released or it leaks.
+                 */
+                kfree(prealloced);
+                prealloced_cn_msg = NULL;
+                prealloced_ulog_tfr = NULL;
+                return r;
+        }
+        return 0;
+}
+/*
+ * dm_ulog_tfr_exit
+ *
+ * Tear down the transfer layer: unregister the connector callback,
+ * then free the preallocated message buffer.
+ */
+void dm_ulog_tfr_exit(void)
+{
+        cn_del_callback(&ulog_cn_id);
+        /* prealloced_cn_msg is the start of the single kmalloc'ed buffer */
+        kfree(prealloced_cn_msg);
+}
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h
new file mode 100644
index 000000000000..c26d8e4e2710
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+#ifndef __DM_LOG_USERSPACE_TRANSFER_H__
+#define __DM_LOG_USERSPACE_TRANSFER_H__
+#define DM_MSG_PREFIX "dm-log-userspace"
+/* Allocate the message buffer and register the connector callback. */
+int dm_ulog_tfr_init(void);
+/* Unregister the connector callback and free the message buffer. */
+void dm_ulog_tfr_exit(void);
+/*
+ * Send one request to the userspace log server and wait for its reply.
+ * 'rdata_size' is value-result: space available in 'rdata' on entry,
+ * bytes stored on return.  Returns 0 on success, -EXXX on failure.
+ */
+int dm_consult_userspace(const char *uuid, int request_type,
+                         char *data, size_t data_size,
+                         char *rdata, size_t *rdata_size);
+#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6fa8ccf91c70..9443896ede07 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -412,11 +412,12 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
                /*
                 * Buffer holds both header and bitset.
                 */
-                buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
+                buf_size =
-                                       bitset_size,
+                    dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size,
-                                       ti->limits.logical_block_size);
+                                bdev_logical_block_size(lc->header_location.
+                                                            bdev));
-                if (buf_size > dev->bdev->bd_inode->i_size) {
+                if (buf_size > i_size_read(dev->bdev->bd_inode)) {
                        DMWARN("log device %s too small: need %llu bytes",
                                dev->name, (unsigned long long)buf_size);
                        kfree(lc);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6a386ab4f7eb..c70604a20897 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -8,7 +8,6 @@
 #include <linux/device-mapper.h>
 #include "dm-path-selector.h"
-#include "dm-bio-record.h"
 #include "dm-uevent.h"
 #include <linux/ctype.h>
@@ -35,6 +34,7 @@ struct pgpath {
        struct dm_path path;
        struct work_struct deactivate_path;
+        struct work_struct activate_path;
 };
 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -64,8 +64,6 @@ struct multipath {
        spinlock_t lock;
        const char *hw_handler_name;
-        struct work_struct activate_path;
-        struct pgpath *pgpath_to_activate;
        unsigned nr_priority_groups;
        struct list_head priority_groups;
        unsigned pg_init_required;      /* pg_init needs calling? */
@@ -84,7 +82,7 @@ struct multipath {
        unsigned pg_init_count;         /* Number of times pg_init called */
        struct work_struct process_queued_ios;
-        struct bio_list queued_ios;
+        struct list_head queued_ios;
        unsigned queue_size;
        struct work_struct trigger_event;
@@ -101,7 +99,7 @@ struct multipath {
 */
 struct dm_mpath_io {
        struct pgpath *pgpath;
-        struct dm_bio_details details;
+        size_t nr_bytes;
 };
 typedef int (*action_fn) (struct pgpath *pgpath);
@@ -128,6 +126,7 @@ static struct pgpath *alloc_pgpath(void)
        if (pgpath) {
                pgpath->is_active = 1;
                INIT_WORK(&pgpath->deactivate_path, deactivate_path);
+                INIT_WORK(&pgpath->activate_path, activate_path);
        }
        return pgpath;
@@ -160,7 +159,6 @@ static struct priority_group *alloc_priority_group(void)
 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
 {
-        unsigned long flags;
        struct pgpath *pgpath, *tmp;
        struct multipath *m = ti->private;
@@ -169,10 +167,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
                if (m->hw_handler_name)
                        scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
                dm_put_device(ti, pgpath->path.dev);
-                spin_lock_irqsave(&m->lock, flags);
-                if (m->pgpath_to_activate == pgpath)
-                        m->pgpath_to_activate = NULL;
-                spin_unlock_irqrestore(&m->lock, flags);
                free_pgpath(pgpath);
        }
 }
@@ -198,11 +192,11 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
        m = kzalloc(sizeof(*m), GFP_KERNEL);
        if (m) {
                INIT_LIST_HEAD(&m->priority_groups);
+                INIT_LIST_HEAD(&m->queued_ios);
                spin_lock_init(&m->lock);
                m->queue_io = 1;
                INIT_WORK(&m->process_queued_ios, process_queued_ios);
                INIT_WORK(&m->trigger_event, trigger_event);
-                INIT_WORK(&m->activate_path, activate_path);
                m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
                if (!m->mpio_pool) {
                        kfree(m);
@@ -250,11 +244,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
        m->pg_init_count = 0;
 }
-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
+                               size_t nr_bytes)
 {
        struct dm_path *path;
-        path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
+        path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
        if (!path)
                return -ENXIO;
@@ -266,7 +261,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
        return 0;
 }
-static void __choose_pgpath(struct multipath *m)
+static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 {
        struct priority_group *pg;
        unsigned bypassed = 1;
@@ -278,12 +273,12 @@ static void __choose_pgpath(struct multipath *m)
        if (m->next_pg) {
                pg = m->next_pg;
                m->next_pg = NULL;
-                if (!__choose_path_in_pg(m, pg))
+                if (!__choose_path_in_pg(m, pg, nr_bytes))
                        return;
        }
        /* Don't change PG until it has no remaining paths */
-        if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
+        if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
                return;
        /*
@@ -295,7 +290,7 @@ static void __choose_pgpath(struct multipath *m)
                list_for_each_entry(pg, &m->priority_groups, list) {
                        if (pg->bypassed == bypassed)
                                continue;
-                        if (!__choose_path_in_pg(m, pg))
+                        if (!__choose_path_in_pg(m, pg, nr_bytes))
                                return;
                }
        } while (bypassed--);
@@ -322,19 +317,21 @@ static int __must_push_back(struct multipath *m)
                dm_noflush_suspending(m->ti));
 }
-static int map_io(struct multipath *m, struct bio *bio,
+static int map_io(struct multipath *m, struct request *clone,
                  struct dm_mpath_io *mpio, unsigned was_queued)
 {
        int r = DM_MAPIO_REMAPPED;
+        size_t nr_bytes = blk_rq_bytes(clone);
        unsigned long flags;
        struct pgpath *pgpath;
+        struct block_device *bdev;
        spin_lock_irqsave(&m->lock, flags);
        /* Do we need to select a new pgpath? */
        if (!m->current_pgpath ||
            (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
-                __choose_pgpath(m);
+                __choose_pgpath(m, nr_bytes);
        pgpath = m->current_pgpath;
@@ -344,21 +341,28 @@ static int map_io(struct multipath *m, struct bio *bio,
        if ((pgpath && m->queue_io) ||
            (!pgpath && m->queue_if_no_path)) {
                /* Queue for the daemon to resubmit */
-                bio_list_add(&m->queued_ios, bio);
+                list_add_tail(&clone->queuelist, &m->queued_ios);
                m->queue_size++;
                if ((m->pg_init_required && !m->pg_init_in_progress) ||
                    !m->queue_io)
                        queue_work(kmultipathd, &m->process_queued_ios);
                pgpath = NULL;
                r = DM_MAPIO_SUBMITTED;
-        } else if (pgpath)
+        } else if (pgpath) {
-                bio->bi_bdev = pgpath->path.dev->bdev;
+                bdev = pgpath->path.dev->bdev;
-        else if (__must_push_back(m))
+                clone->q = bdev_get_queue(bdev);
+                clone->rq_disk = bdev->bd_disk;
+        } else if (__must_push_back(m))
                r = DM_MAPIO_REQUEUE;
        else
                r = -EIO;       /* Failed */
        mpio->pgpath = pgpath;
+        mpio->nr_bytes = nr_bytes;
+        if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
+                pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
+                                              nr_bytes);
        spin_unlock_irqrestore(&m->lock, flags);
@@ -396,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m)
 {
        int r;
        unsigned long flags;
-        struct bio *bio = NULL, *next;
        struct dm_mpath_io *mpio;
        union map_info *info;
+        struct request *clone, *n;
+        LIST_HEAD(cl);
        spin_lock_irqsave(&m->lock, flags);
-        bio = bio_list_get(&m->queued_ios);
+        list_splice_init(&m->queued_ios, &cl);
        spin_unlock_irqrestore(&m->lock, flags);
-        while (bio) {
+        list_for_each_entry_safe(clone, n, &cl, queuelist) {
-                next = bio->bi_next;
+                list_del_init(&clone->queuelist);
-                bio->bi_next = NULL;
-                info = dm_get_mapinfo(bio);
+                info = dm_get_rq_mapinfo(clone);
                mpio = info->ptr;
-                r = map_io(m, bio, mpio, 1);
+                r = map_io(m, clone, mpio, 1);
-                if (r < 0)
+                if (r < 0) {
-                        bio_endio(bio, r);
+                        mempool_free(mpio, m->mpio_pool);
-                else if (r == DM_MAPIO_REMAPPED)
+                        dm_kill_unmapped_request(clone, r);
-                        generic_make_request(bio);
+                } else if (r == DM_MAPIO_REMAPPED)
-                else if (r == DM_MAPIO_REQUEUE)
+                        dm_dispatch_request(clone);
-                        bio_endio(bio, -EIO);
+                else if (r == DM_MAPIO_REQUEUE) {
+                        mempool_free(mpio, m->mpio_pool);
-                bio = next;
+                        dm_requeue_unmapped_request(clone);
+                }
        }
 }
@@ -427,8 +432,8 @@ static void process_queued_ios(struct work_struct *work)
 {
        struct multipath *m =
                container_of(work, struct multipath, process_queued_ios);
-        struct pgpath *pgpath = NULL;
+        struct pgpath *pgpath = NULL, *tmp;
-        unsigned init_required = 0, must_queue = 1;
+        unsigned must_queue = 1;
        unsigned long flags;
        spin_lock_irqsave(&m->lock, flags);
@@ -437,7 +442,7 @@ static void process_queued_ios(struct work_struct *work)
                goto out;
        if (!m->current_pgpath)
-                __choose_pgpath(m);
+                __choose_pgpath(m, 0);
        pgpath = m->current_pgpath;
@@ -446,19 +451,15 @@ static void process_queued_ios(struct work_struct *work)
                must_queue = 0;
        if (m->pg_init_required && !m->pg_init_in_progress && pgpath) {
-                m->pgpath_to_activate = pgpath;
                m->pg_init_count++;
                m->pg_init_required = 0;
-                m->pg_init_in_progress = 1;
+                list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) {
-                init_required = 1;
+                        if (queue_work(kmpath_handlerd, &tmp->activate_path))
+                                m->pg_init_in_progress++;
+                }
        }
 out:
        spin_unlock_irqrestore(&m->lock, flags);
-        if (init_required)
-                queue_work(kmpath_handlerd, &m->activate_path);
        if (!must_queue)
                dispatch_queued_ios(m);
 }
@@ -553,6 +554,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
                return -EINVAL;
        }
+        if (ps_argc > as->argc) {
+                dm_put_path_selector(pst);
+                ti->error = "not enough arguments for path selector";
+                return -EINVAL;
+        }
        r = pst->create(&pg->ps, ps_argc, as->argv);
        if (r) {
                dm_put_path_selector(pst);
@@ -591,9 +598,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
        }
        if (m->hw_handler_name) {
-                r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev),
+                struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
-                                   m->hw_handler_name);
+                r = scsi_dh_attach(q, m->hw_handler_name);
+                if (r == -EBUSY) {
+                        /*
+                         * Already attached to different hw_handler,
+                         * try to reattach with correct one.
+                         */
+                        scsi_dh_detach(q);
+                        r = scsi_dh_attach(q, m->hw_handler_name);
+                }
                if (r < 0) {
+                        ti->error = "error attaching hardware handler";
                        dm_put_device(ti, p->path.dev);
                        goto bad;
                }
@@ -699,6 +717,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
        if (!hw_argc)
                return 0;
+        if (hw_argc > as->argc) {
+                ti->error = "not enough arguments for hardware handler";
+                return -EINVAL;
+        }
        m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
        request_module("scsi_dh_%s", m->hw_handler_name);
        if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
@@ -823,6 +846,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
                goto bad;
        }
+        ti->num_flush_requests = 1;
        return 0;
 bad:
@@ -836,25 +861,29 @@ static void multipath_dtr(struct dm_target *ti)
        flush_workqueue(kmpath_handlerd);
        flush_workqueue(kmultipathd);
+        flush_scheduled_work();
        free_multipath(m);
 }
 /*
- * Map bios, recording original fields for later in case we have to resubmit
+ * Map cloned requests
 */
-static int multipath_map(struct dm_target *ti, struct bio *bio,
+static int multipath_map(struct dm_target *ti, struct request *clone,
                         union map_info *map_context)
 {
        int r;
        struct dm_mpath_io *mpio;
        struct multipath *m = (struct multipath *) ti->private;
-        mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
+        mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
-        dm_bio_record(&mpio->details, bio);
+        if (!mpio)
+                /* ENOMEM, requeue */
+                return DM_MAPIO_REQUEUE;
+        memset(mpio, 0, sizeof(*mpio));
        map_context->ptr = mpio;
-        bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
+        clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-        r = map_io(m, bio, mpio, 0);
+        r = map_io(m, clone, mpio, 0);
        if (r < 0 || r == DM_MAPIO_REQUEUE)
                mempool_free(mpio, m->mpio_pool);
@@ -924,9 +953,13 @@ static int reinstate_path(struct pgpath *pgpath)
        pgpath->is_active = 1;
-        m->current_pgpath = NULL;
+        if (!m->nr_valid_paths++ && m->queue_size) {
-        if (!m->nr_valid_paths++ && m->queue_size)
+                m->current_pgpath = NULL;
                queue_work(kmultipathd, &m->process_queued_ios);
+        } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
+                if (queue_work(kmpath_handlerd, &pgpath->activate_path))
+                        m->pg_init_in_progress++;
+        }
        dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
                      pgpath->path.dev->name, m->nr_valid_paths);
@@ -1102,87 +1135,70 @@ static void pg_init_done(struct dm_path *path, int errors)
        spin_lock_irqsave(&m->lock, flags);
        if (errors) {
-                DMERR("Could not failover device. Error %d.", errors);
+                if (pgpath == m->current_pgpath) {
-                m->current_pgpath = NULL;
+                        DMERR("Could not failover device. Error %d.", errors);
-                m->current_pg = NULL;
+                        m->current_pgpath = NULL;
+                        m->current_pg = NULL;
+                }
        } else if (!m->pg_init_required) {
                m->queue_io = 0;
                pg->bypassed = 0;
        }
-        m->pg_init_in_progress = 0;
+        m->pg_init_in_progress--;
-        queue_work(kmultipathd, &m->process_queued_ios);
+        if (!m->pg_init_in_progress)
+                queue_work(kmultipathd, &m->process_queued_ios);
        spin_unlock_irqrestore(&m->lock, flags);
 }
 static void activate_path(struct work_struct *work)
 {
        int ret;
-        struct multipath *m =
+        struct pgpath *pgpath =
-                container_of(work, struct multipath, activate_path);
+                container_of(work, struct pgpath, activate_path);
-        struct dm_path *path;
-        unsigned long flags;
-        spin_lock_irqsave(&m->lock, flags);
+        ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev));
-        path = &m->pgpath_to_activate->path;
+        pg_init_done(&pgpath->path, ret);
-        m->pgpath_to_activate = NULL;
-        spin_unlock_irqrestore(&m->lock, flags);
-        if (!path)
-                return;
-        ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev));
-        pg_init_done(path, ret);
 }
 /*
 * end_io handling
 */
-static int do_end_io(struct multipath *m, struct bio *bio,
+static int do_end_io(struct multipath *m, struct request *clone,
                     int error, struct dm_mpath_io *mpio)
 {
+        /*
+         * We don't queue any clone request inside the multipath target
+         * during end I/O handling, since those clone requests don't have
+         * bio clones.  If we queue them inside the multipath target,
+         * we need to make bio clones, that requires memory allocation.
+         * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
+         *  don't have bio clones.)
+         * Instead of queueing the clone request here, we queue the original
+         * request into dm core, which will remake a clone request and
+         * clone bios for it and resubmit it later.
+         */
+        int r = DM_ENDIO_REQUEUE;
        unsigned long flags;
-        if (!error)
+        if (!error && !clone->errors)
                return 0;       /* I/O complete */
-        if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-                return error;
        if (error == -EOPNOTSUPP)
                return error;
-        spin_lock_irqsave(&m->lock, flags);
-        if (!m->nr_valid_paths) {
-                if (__must_push_back(m)) {
-                        spin_unlock_irqrestore(&m->lock, flags);
-                        return DM_ENDIO_REQUEUE;
-                } else if (!m->queue_if_no_path) {
-                        spin_unlock_irqrestore(&m->lock, flags);
-                        return -EIO;
-                } else {
-                        spin_unlock_irqrestore(&m->lock, flags);
-                        goto requeue;
-                }
-        }
-        spin_unlock_irqrestore(&m->lock, flags);
        if (mpio->pgpath)
                fail_path(mpio->pgpath);
-      requeue:
-        dm_bio_restore(&mpio->details, bio);
-        /* queue for the daemon to resubmit or fail */
        spin_lock_irqsave(&m->lock, flags);
-        bio_list_add(&m->queued_ios, bio);
+        if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
-        m->queue_size++;
+                r = -EIO;
-        if (!m->queue_io)
-                queue_work(kmultipathd, &m->process_queued_ios);
        spin_unlock_irqrestore(&m->lock, flags);
-        return DM_ENDIO_INCOMPLETE;     /* io not complete */
+        return r;
 }
-static int multipath_end_io(struct dm_target *ti, struct bio *bio,
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
                            int error, union map_info *map_context)
 {
        struct multipath *m = ti->private;
@@ -1191,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
        struct path_selector *ps;
        int r;
-        r  = do_end_io(m, bio, error, mpio);
+        r  = do_end_io(m, clone, error, mpio);
        if (pgpath) {
                ps = &pgpath->pg->ps;
                if (ps->type->end_io)
-                        ps->type->end_io(ps, &pgpath->path);
+                        ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
        }
-        if (r != DM_ENDIO_INCOMPLETE)
+        mempool_free(mpio, m->mpio_pool);
-                mempool_free(mpio, m->mpio_pool);
        return r;
 }
@@ -1411,7 +1426,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
        spin_lock_irqsave(&m->lock, flags);
        if (!m->current_pgpath)
-                __choose_pgpath(m);
+                __choose_pgpath(m, 0);
        if (m->current_pgpath) {
                bdev = m->current_pgpath->path.dev->bdev;
@@ -1428,22 +1443,113 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
        return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 }
+/*
+ * Call fn for the device of every path in every priority group.
+ * Iteration stops at the first non-zero result, which is returned;
+ * 0 is returned when every callout returned 0.
+ */
+static int multipath_iterate_devices(struct dm_target *ti,
+                                     iterate_devices_callout_fn fn, void *data)
+{
+        struct multipath *m = ti->private;
+        struct priority_group *group;
+        struct pgpath *entry;
+        int rc;
+        list_for_each_entry(group, &m->priority_groups, list) {
+                list_for_each_entry(entry, &group->pgpaths, list) {
+                        rc = fn(ti, entry->path.dev, ti->begin, data);
+                        if (rc)
+                                return rc;
+                }
+        }
+        return 0;
+}
+/* Return dm_underlying_device_busy()'s verdict for the queue under this path. */
+static int __pgpath_busy(struct pgpath *pgpath)
+{
+        return dm_underlying_device_busy(
+                        bdev_get_queue(pgpath->path.dev->bdev));
+}
+/*
+ * Report "busy" only when I/O could be mapped right now but every
+ * active path we would map it to sits on a busy underlying queue
+ * (so the I/O would merely wait there).
+ * When we intend to kill I/Os or queue them internally because no
+ * mapping is available, we must not report "busy": dm core would then
+ * withhold the I/Os and we could not act on them.
+ */
+static int multipath_busy(struct dm_target *ti)
+{
+        struct multipath *m = ti->private;
+        struct priority_group *pg;
+        struct pgpath *pgpath;
+        unsigned long flags;
+        int has_active = 0, has_idle = 0;
+        int busy = 0;
+        spin_lock_irqsave(&m->lock, flags);
+        /* Guess which priority_group the next mapping will use. */
+        if (unlikely(!m->current_pgpath && m->next_pg)) {
+                pg = m->next_pg;
+        } else if (likely(m->current_pg)) {
+                pg = m->current_pg;
+        } else {
+                /*
+                 * The pg for the next mapping is unknown.
+                 * __choose_pgpath() is deliberately not called here so
+                 * that a mere busy check never triggers pg_init.
+                 * Without a pg we cannot judge the underlying devices,
+                 * so report "not busy" and let the mapping be tried.
+                 */
+                goto unlock;
+        }
+        /*
+         * One active, non-busy path is enough: the path selector will
+         * be able to pick it, so such a pg counts as not busy.
+         */
+        list_for_each_entry(pgpath, &pg->pgpaths, list) {
+                if (!pgpath->is_active)
+                        continue;
+                has_active = 1;
+                if (!__pgpath_busy(pgpath)) {
+                        has_idle = 1;
+                        break;
+                }
+        }
+        /*
+         * With no active path at all this pg will not be used and
+         * current_pg changes at the next mapping; that also has to be
+         * tried, hence "not busy" in that case too.
+         */
+        busy = has_active && !has_idle;
+unlock:
+        spin_unlock_irqrestore(&m->lock, flags);
+        return busy;
+}
 /*-----------------------------------------------------------------
 * Module setup
 *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
        .name = "multipath",
-        .version = {1, 0, 5},
+        .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr = multipath_ctr,
        .dtr = multipath_dtr,
-        .map = multipath_map,
+        .map_rq = multipath_map,
-        .end_io = multipath_end_io,
+        .rq_end_io = multipath_end_io,
        .presuspend = multipath_presuspend,
        .resume = multipath_resume,
        .status = multipath_status,
        .message = multipath_message,
        .ioctl  = multipath_ioctl,
+        .iterate_devices = multipath_iterate_devices,
+        .busy = multipath_busy,
 };
 static int __init dm_multipath_init(void)
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index 27357b85d73d..e7d1fa8b0459 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -56,7 +56,8 @@ struct path_selector_type {
         * the path fails.
         */
        struct dm_path *(*select_path) (struct path_selector *ps,
-                                     unsigned *repeat_count);
+                                        unsigned *repeat_count,
+                                        size_t nr_bytes);
        /*
         * Notify the selector that a path has failed.
@@ -75,7 +76,10 @@ struct path_selector_type {
        int (*status) (struct path_selector *ps, struct dm_path *path,
                       status_type_t type, char *result, unsigned int maxlen);
-        int (*end_io) (struct path_selector *ps, struct dm_path *path);
+        int (*start_io) (struct path_selector *ps, struct dm_path *path,
+                         size_t nr_bytes);
+        int (*end_io) (struct path_selector *ps, struct dm_path *path,
+                       size_t nr_bytes);
 };
 /* Register a path selector */
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
new file mode 100644
index 000000000000..f92b6cea9d9c
--- /dev/null
+++ b/drivers/md/dm-queue-length.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2004-2005 IBM Corp.  All Rights Reserved.
+ * Copyright (C) 2006-2009 NEC Corporation.
+ *
+ * dm-queue-length.c
+ *
+ * Module Author: Stefan Bader, IBM
+ * Modified by: Kiyoshi Ueda, NEC
+ *
+ * This file is released under the GPL.
+ *
+ * queue-length path selector - choose a path with the least number of
+ * in-flight I/Os.
+ */
+#include "dm.h"
+#include "dm-path-selector.h"
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+#define DM_MSG_PREFIX   "multipath queue-length"
+#define QL_MIN_IO       128
+#define QL_VERSION      "0.1.0"
+struct selector {
+        struct list_head        valid_paths;    /* path_info entries scanned by ql_select_path() */
+        struct list_head        failed_paths;   /* path_info entries moved here by ql_fail_path() */
+};
+struct path_info {
+        struct list_head        list;           /* link in selector valid_paths or failed_paths */
+        struct dm_path          *path;          /* path this entry describes; returned on selection */
+        unsigned                repeat_count;   /* handed back via *repeat_count on selection */
+        atomic_t                qlen;   /* the number of in-flight I/Os */
+};
+/* Allocate a selector with both path lists initialised; NULL if kmalloc fails. */
+static struct selector *alloc_selector(void)
+{
+        struct selector *sel = kmalloc(sizeof(*sel), GFP_KERNEL);
+        if (!sel)
+                return NULL;
+        INIT_LIST_HEAD(&sel->valid_paths);
+        INIT_LIST_HEAD(&sel->failed_paths);
+        return sel;
+}
+/*
+ * Create the selector context for ps.  argc/argv are not examined.
+ * Returns 0 on success, -ENOMEM (leaving ps->context untouched) otherwise.
+ */
+static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+        struct selector *sel = alloc_selector();
+        if (sel) {
+                ps->context = sel;
+                return 0;
+        }
+        return -ENOMEM;
+}
+/* Unlink and kfree every path_info found on the given list. */
+static void ql_free_paths(struct list_head *paths)
+{
+        struct path_info *victim;
+        while (!list_empty(paths)) {
+                victim = list_entry(paths->next, struct path_info, list);
+                list_del(&victim->list);
+                kfree(victim);
+        }
+}
+/* Release all path_info entries and the selector itself, then clear ps->context. */
+static void ql_destroy(struct path_selector *ps)
+{
+        struct selector *sel = ps->context;
+        struct list_head *valid = &sel->valid_paths;
+        struct list_head *failed = &sel->failed_paths;
+        ql_free_paths(valid);
+        ql_free_paths(failed);
+        kfree(sel);
+        ps->context = NULL;
+}
+/*
+ * Emit status text into result (at most maxlen bytes) and return the
+ * number of bytes accumulated in sz.
+ *   path == NULL      : selector-level status, always "0 ".
+ *   STATUSTYPE_INFO   : the path's current qlen.
+ *   STATUSTYPE_TABLE  : the path's repeat_count.
+ */
+static int ql_status(struct path_selector *ps, struct dm_path *path,
+                     status_type_t type, char *result, unsigned maxlen)
+{
+        unsigned sz = 0;
+        struct path_info *pi;
+        if (!path) {
+                DMEMIT("0 ");
+                return sz;
+        }
+        pi = path->pscontext;
+        if (type == STATUSTYPE_INFO)
+                DMEMIT("%d ", atomic_read(&pi->qlen));
+        else if (type == STATUSTYPE_TABLE)
+                DMEMIT("%u ", pi->repeat_count);
+        return sz;
+}
+/*
+ * Add a path to the selector's valid list.
+ *
+ * Arguments: [<repeat_count>]
+ *      <repeat_count>: The number of I/Os before switching path.
+ *                      If not given, default (QL_MIN_IO) is used.
+ *
+ * Returns 0 on success; -EINVAL for bad arguments or -ENOMEM when the
+ * path information cannot be allocated, with *error set to a message.
+ */
+static int ql_add_path(struct path_selector *ps, struct dm_path *path,
+                       int argc, char **argv, char **error)
+{
+        struct selector *sel = ps->context;
+        struct path_info *info;
+        unsigned count = QL_MIN_IO;
+        if (argc > 1) {
+                *error = "queue-length ps: incorrect number of arguments";
+                return -EINVAL;
+        }
+        if (argc == 1 && sscanf(argv[0], "%u", &count) != 1) {
+                *error = "queue-length ps: invalid repeat count";
+                return -EINVAL;
+        }
+        info = kmalloc(sizeof(*info), GFP_KERNEL);
+        if (!info) {
+                *error = "queue-length ps: Error allocating path information";
+                return -ENOMEM;
+        }
+        /* Fill in the new entry; it starts with no I/O in flight. */
+        info->path = path;
+        info->repeat_count = count;
+        atomic_set(&info->qlen, 0);
+        path->pscontext = info;
+        list_add_tail(&info->list, &sel->valid_paths);
+        return 0;
+}
+/* Move the path's entry onto the selector's failed list. */
+static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+        struct path_info *info = path->pscontext;
+        struct selector *sel = ps->context;
+        list_move(&info->list, &sel->failed_paths);
+}
+/* Put the path's entry back at the tail of the valid list; always returns 0. */
+static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+        struct path_info *info = path->pscontext;
+        struct selector *sel = ps->context;
+        list_move_tail(&info->list, &sel->valid_paths);
+        return 0;
+}
+/*
+ * Pick the valid path with the fewest in-flight I/Os.
+ * Returns NULL when no valid path exists; otherwise returns the chosen
+ * path and stores its repeat_count in *repeat_count.  nr_bytes is unused.
+ */
+static struct dm_path *ql_select_path(struct path_selector *ps,
+                                      unsigned *repeat_count, size_t nr_bytes)
+{
+        struct selector *sel = ps->context;
+        struct path_info *cur = NULL, *best = NULL;
+        if (list_empty(&sel->valid_paths))
+                return NULL;
+        /* Rotate the head to the tail so ties do not always favour one path. */
+        list_move_tail(sel->valid_paths.next, &sel->valid_paths);
+        list_for_each_entry(cur, &sel->valid_paths, list) {
+                if (!best ||
+                    (atomic_read(&cur->qlen) < atomic_read(&best->qlen)))
+                        best = cur;
+                /* A path with nothing in flight cannot be beaten. */
+                if (atomic_read(&best->qlen) == 0)
+                        break;
+        }
+        if (best) {
+                *repeat_count = best->repeat_count;
+                return best->path;
+        }
+        return NULL;
+}
+/* Count one more in-flight I/O on the path; nr_bytes is unused. Returns 0. */
+static int ql_start_io(struct path_selector *ps, struct dm_path *path,
+                       size_t nr_bytes)
+{
+        struct path_info *info = path->pscontext;
+        atomic_inc(&info->qlen);
+        return 0;
+}
+/* Count one fewer in-flight I/O on the path; nr_bytes is unused. Returns 0. */
+static int ql_end_io(struct path_selector *ps, struct dm_path *path,
+                     size_t nr_bytes)
+{
+        struct path_info *info = path->pscontext;
+        atomic_dec(&info->qlen);
+        return 0;
+}
+/* Path selector type descriptor for "queue-length". */
+static struct path_selector_type ql_ps = {
+        .name = "queue-length",
+        .module = THIS_MODULE,
+        /* one value each for table and info output, see ql_status() */
+        .table_args = 1,
+        .info_args = 1,
+        /* selector lifetime */
+        .create = ql_create,
+        .destroy = ql_destroy,
+        /* path membership */
+        .add_path = ql_add_path,
+        .fail_path = ql_fail_path,
+        .reinstate_path = ql_reinstate_path,
+        /* per-I/O hooks */
+        .select_path = ql_select_path,
+        .start_io = ql_start_io,
+        .end_io = ql_end_io,
+        .status = ql_status,
+};
+/*
+ * Module init: register the queue-length path selector.
+ * Returns the result of dm_register_path_selector().  The "loaded"
+ * banner is printed only when registration did not fail, so the log
+ * never claims a load that is about to be rejected.
+ */
+static int __init dm_ql_init(void)
+{
+        int r = dm_register_path_selector(&ql_ps);
+        if (r < 0) {
+                DMERR("register failed %d", r);
+                return r;
+        }
+        DMINFO("version " QL_VERSION " loaded");
+        return r;
+}
+static void __exit dm_ql_exit(void)
+{
+        int r = dm_unregister_path_selector(&ql_ps);
+        if (r < 0)
+                DMERR("unregister failed %d", r);
+}
+module_init(dm_ql_init);
+module_exit(dm_ql_exit);
+MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
+MODULE_DESCRIPTION(
+        "(C) Copyright IBM Corp. 2004,2005   All Rights Reserved.\n"
+        DM_NAME " path selector to balance the number of in-flight I/Os"
+);
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 076fbb4e967a..ce8868c768cc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
        return 0;
 }
+/*
+ * Call fn for each mirror leg (device and offset) in index order.
+ * Stops at and returns the first non-zero result; returns 0 otherwise.
+ */
+static int mirror_iterate_devices(struct dm_target *ti,
+                                  iterate_devices_callout_fn fn, void *data)
+{
+        struct mirror_set *ms = ti->private;
+        unsigned leg;
+        int rc;
+        for (leg = 0; leg < ms->nr_mirrors; leg++) {
+                rc = fn(ti, ms->mirror[leg].dev, ms->mirror[leg].offset, data);
+                if (rc)
+                        return rc;
+        }
+        return 0;
+}
 static struct target_type mirror_target = {
        .name    = "mirror",
-        .version = {1, 0, 20},
+        .version = {1, 12, 0},
        .module  = THIS_MODULE,
        .ctr     = mirror_ctr,
        .dtr     = mirror_dtr,
@@ -1295,6 +1309,7 @@ static struct target_type mirror_target = {
        .postsuspend = mirror_postsuspend,
        .resume  = mirror_resume,
        .status  = mirror_status,
+        .iterate_devices = mirror_iterate_devices,
 };
 static int __init dm_mirror_init(void)
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 7b899be0b087..36dbe29f2fd6 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
        nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
        if (unlikely(!nreg))
-                nreg = kmalloc(sizeof(*nreg), GFP_NOIO);
+                nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
        nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
                      DM_RH_CLEAN : DM_RH_NOSYNC;
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index cdfbf65b28cb..24752f449bef 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
 }
 static struct dm_path *rr_select_path(struct path_selector *ps,
-                                   unsigned *repeat_count)
+                                      unsigned *repeat_count, size_t nr_bytes)
 {
        struct selector *s = (struct selector *) ps->context;
        struct path_info *pi = NULL;
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
new file mode 100644
index 000000000000..cfa668f46c40
--- /dev/null
+++ b/drivers/md/dm-service-time.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
+ *
+ * Module Author: Kiyoshi Ueda
+ *
+ * This file is released under the GPL.
+ *
+ * Throughput oriented path selector.
+ */
+#include "dm.h"
+#include "dm-path-selector.h"
+#define DM_MSG_PREFIX   "multipath service-time"
+#define ST_MIN_IO       1
+#define ST_MAX_RELATIVE_THROUGHPUT      100
+#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT        7
+#define ST_MAX_INFLIGHT_SIZE    ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
+#define ST_VERSION      "0.2.0"
+struct selector {
+        struct list_head valid_paths;   /* path_info entries scanned by st_select_path() */
+        struct list_head failed_paths;  /* path_info entries moved here by st_fail_path() */
+};
+struct path_info {
+        struct list_head list;          /* link in selector valid_paths or failed_paths */
+        struct dm_path *path;           /* path this entry describes; returned on selection */
+        unsigned repeat_count;          /* handed back via *repeat_count on selection */
+        unsigned relative_throughput;   /* weight parsed in st_add_path(), 0..ST_MAX_RELATIVE_THROUGHPUT */
+        atomic_t in_flight_size;        /* Total size of in-flight I/Os */
+};
+/* Allocate a selector with both path lists initialised; NULL if kmalloc fails. */
+static struct selector *alloc_selector(void)
+{
+        struct selector *sel = kmalloc(sizeof(*sel), GFP_KERNEL);
+        if (!sel)
+                return NULL;
+        INIT_LIST_HEAD(&sel->valid_paths);
+        INIT_LIST_HEAD(&sel->failed_paths);
+        return sel;
+}
+/*
+ * Create the selector context for ps.  argc/argv are not examined.
+ * Returns 0 on success, -ENOMEM (leaving ps->context untouched) otherwise.
+ */
+static int st_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+        struct selector *sel = alloc_selector();
+        if (sel) {
+                ps->context = sel;
+                return 0;
+        }
+        return -ENOMEM;
+}
+/* Unlink and kfree every path_info found on the given list. */
+static void free_paths(struct list_head *paths)
+{
+        struct path_info *victim;
+        while (!list_empty(paths)) {
+                victim = list_entry(paths->next, struct path_info, list);
+                list_del(&victim->list);
+                kfree(victim);
+        }
+}
+/* Release all path_info entries and the selector itself, then clear ps->context. */
+static void st_destroy(struct path_selector *ps)
+{
+        struct selector *sel = ps->context;
+        struct list_head *valid = &sel->valid_paths;
+        struct list_head *failed = &sel->failed_paths;
+        free_paths(valid);
+        free_paths(failed);
+        kfree(sel);
+        ps->context = NULL;
+}
+/*
+ * Emit status text into result (at most maxlen bytes) and return the
+ * number of bytes accumulated in sz.
+ *   path == NULL      : selector-level status, always "0 ".
+ *   STATUSTYPE_INFO   : in_flight_size and relative_throughput.
+ *   STATUSTYPE_TABLE  : repeat_count and relative_throughput.
+ */
+static int st_status(struct path_selector *ps, struct dm_path *path,
+                     status_type_t type, char *result, unsigned maxlen)
+{
+        unsigned sz = 0;
+        struct path_info *pi;
+        if (!path) {
+                DMEMIT("0 ");
+                return sz;
+        }
+        pi = path->pscontext;
+        if (type == STATUSTYPE_INFO)
+                DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
+                       pi->relative_throughput);
+        else if (type == STATUSTYPE_TABLE)
+                DMEMIT("%u %u ", pi->repeat_count,
+                       pi->relative_throughput);
+        return sz;
+}
+/*
+ * Add a path to the selector's valid list.
+ *
+ * Arguments: [<repeat_count> [<relative_throughput>]]
+ *      <repeat_count>: The number of I/Os before switching path.
+ *                      If not given, default (ST_MIN_IO) is used.
+ *      <relative_throughput>: The relative throughput value of
+ *                      the path among all paths in the path-group.
+ *                      The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
+ *                      If not given, minimum value '1' is used.
+ *                      If '0' is given, the path isn't selected while
+ *                      other paths having a positive value are
+ *                      available.
+ *
+ * Returns 0 on success; -EINVAL for bad arguments or -ENOMEM when the
+ * path context cannot be allocated, with *error set to a message.
+ */
+static int st_add_path(struct path_selector *ps, struct dm_path *path,
+                       int argc, char **argv, char **error)
+{
+        struct selector *sel = ps->context;
+        struct path_info *info;
+        unsigned count = ST_MIN_IO;
+        unsigned throughput = 1;
+        if (argc > 2) {
+                *error = "service-time ps: incorrect number of arguments";
+                return -EINVAL;
+        }
+        if (argc && sscanf(argv[0], "%u", &count) != 1) {
+                *error = "service-time ps: invalid repeat count";
+                return -EINVAL;
+        }
+        if (argc == 2) {
+                if (sscanf(argv[1], "%u", &throughput) != 1 ||
+                    throughput > ST_MAX_RELATIVE_THROUGHPUT) {
+                        *error = "service-time ps: invalid relative_throughput value";
+                        return -EINVAL;
+                }
+        }
+        info = kmalloc(sizeof(*info), GFP_KERNEL);
+        if (!info) {
+                *error = "service-time ps: Error allocating path context";
+                return -ENOMEM;
+        }
+        /* Fill in the new entry; it starts with nothing in flight. */
+        info->path = path;
+        info->repeat_count = count;
+        info->relative_throughput = throughput;
+        atomic_set(&info->in_flight_size, 0);
+        path->pscontext = info;
+        list_add_tail(&info->list, &sel->valid_paths);
+        return 0;
+}
+/* Move the path's entry onto the selector's failed list. */
+static void st_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+        struct path_info *info = path->pscontext;
+        struct selector *sel = ps->context;
+        list_move(&info->list, &sel->failed_paths);
+}
+/* Put the path's entry back at the tail of the valid list; always returns 0. */
+static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+        struct path_info *info = path->pscontext;
+        struct selector *sel = ps->context;
+        list_move_tail(&info->list, &sel->valid_paths);
+        return 0;
+}
+/*
+ * Compare the estimated service time of 2 paths, pi1 and pi2,
+ * for the incoming I/O.
+ *
+ * Returns:
+ * < 0 : pi1 is better
+ * 0   : no difference between pi1 and pi2
+ * > 0 : pi2 is better
+ *
+ * Description:
+ * Basically, the service time is estimated by:
+ *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
+ * To reduce the calculation, some optimizations are made.
+ * (See comments inline)
+ *
+ * The scaled service times are size_t products and are ordered with an
+ * explicit comparison: returning their difference would narrow it to
+ * int, which can flip the sign (or yield 0) once the two values are
+ * more than INT_MAX apart and so select the slower path.
+ */
+static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
+                           size_t incoming)
+{
+        size_t sz1, sz2, st1, st2;
+        sz1 = atomic_read(&pi1->in_flight_size);
+        sz2 = atomic_read(&pi2->in_flight_size);
+        /*
+         * Case 1: Both have same throughput value. Choose less loaded path.
+         */
+        if (pi1->relative_throughput == pi2->relative_throughput)
+                return sz1 - sz2;
+        /*
+         * Case 2a: Both have same load. Choose higher throughput path.
+         * Case 2b: One path has no throughput value. Choose the other one.
+         */
+        if (sz1 == sz2 ||
+            !pi1->relative_throughput || !pi2->relative_throughput)
+                return pi2->relative_throughput - pi1->relative_throughput;
+        /*
+         * Case 3: Calculate service time. Choose faster path.
+         *         Service time using pi1:
+         *             st1 = (sz1 + incoming) / pi1->relative_throughput
+         *         Service time using pi2:
+         *             st2 = (sz2 + incoming) / pi2->relative_throughput
+         *
+         *         To avoid the division, transform the expression to use
+         *         multiplication.
+         *         Because ->relative_throughput > 0 here, if st1 < st2,
+         *         the expressions below are the same meaning:
+         *             (sz1 + incoming) / pi1->relative_throughput <
+         *                 (sz2 + incoming) / pi2->relative_throughput
+         *             (sz1 + incoming) * pi2->relative_throughput <
+         *                 (sz2 + incoming) * pi1->relative_throughput
+         *         So use the latter one.
+         */
+        sz1 += incoming;
+        sz2 += incoming;
+        if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
+                     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
+                /*
+                 * Size may be too big for multiplying pi->relative_throughput
+                 * and overflow.
+                 * To avoid the overflow and mis-selection, shift down both.
+                 */
+                sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+                sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+        }
+        st1 = sz1 * pi2->relative_throughput;
+        st2 = sz2 * pi1->relative_throughput;
+        /* Compare instead of subtracting: st1 - st2 may not fit in int. */
+        if (st1 != st2)
+                return st1 < st2 ? -1 : 1;
+        /*
+         * Case 4: Service time is equal. Choose higher throughput path.
+         */
+        return pi2->relative_throughput - pi1->relative_throughput;
+}
+/*
+ * Pick the valid path that st_compare_load() ranks best for an I/O of
+ * nr_bytes.  Returns NULL when no valid path exists; otherwise returns
+ * the chosen path and stores its repeat_count in *repeat_count.
+ */
+static struct dm_path *st_select_path(struct path_selector *ps,
+                                      unsigned *repeat_count, size_t nr_bytes)
+{
+        struct selector *sel = ps->context;
+        struct path_info *cand = NULL, *best = NULL;
+        if (list_empty(&sel->valid_paths))
+                return NULL;
+        /* Rotate the head to the tail so ties do not always favour one path. */
+        list_move_tail(sel->valid_paths.next, &sel->valid_paths);
+        list_for_each_entry(cand, &sel->valid_paths, list) {
+                if (best && st_compare_load(cand, best, nr_bytes) >= 0)
+                        continue;
+                best = cand;
+        }
+        if (best) {
+                *repeat_count = best->repeat_count;
+                return best->path;
+        }
+        return NULL;
+}
+/* Add nr_bytes to the path's in-flight size. Returns 0. */
+static int st_start_io(struct path_selector *ps, struct dm_path *path,
+                       size_t nr_bytes)
+{
+        struct path_info *info = path->pscontext;
+        atomic_add(nr_bytes, &info->in_flight_size);
+        return 0;
+}
+/* Subtract nr_bytes from the path's in-flight size. Returns 0. */
+static int st_end_io(struct path_selector *ps, struct dm_path *path,
+                     size_t nr_bytes)
+{
+        struct path_info *info = path->pscontext;
+        atomic_sub(nr_bytes, &info->in_flight_size);
+        return 0;
+}
+/* Path selector type descriptor for "service-time". */
+static struct path_selector_type st_ps = {
+        .name = "service-time",
+        .module = THIS_MODULE,
+        /* two values each for table and info output, see st_status() */
+        .table_args = 2,
+        .info_args = 2,
+        /* selector lifetime */
+        .create = st_create,
+        .destroy = st_destroy,
+        /* path membership */
+        .add_path = st_add_path,
+        .fail_path = st_fail_path,
+        .reinstate_path = st_reinstate_path,
+        /* per-I/O hooks */
+        .select_path = st_select_path,
+        .start_io = st_start_io,
+        .end_io = st_end_io,
+        .status = st_status,
+};
+/*
+ * Module init: register the service-time path selector.
+ * Returns the result of dm_register_path_selector().  The "loaded"
+ * banner is printed only when registration did not fail, so the log
+ * never claims a load that is about to be rejected.
+ */
+static int __init dm_st_init(void)
+{
+        int r = dm_register_path_selector(&st_ps);
+        if (r < 0) {
+                DMERR("register failed %d", r);
+                return r;
+        }
+        DMINFO("version " ST_VERSION " loaded");
+        return r;
+}
+static void __exit dm_st_exit(void)
+{
+        int r = dm_unregister_path_selector(&st_ps);
+        if (r < 0)
+                DMERR("unregister failed %d", r);
+}
+module_init(dm_st_init);
+module_exit(dm_st_exit);
+MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
+MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2662a41337e7..6e3fe4f14934 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
        /*
         * Commit exceptions to disk.
         */
-        if (ps->valid && area_io(ps, WRITE))
+        if (ps->valid && area_io(ps, WRITE_BARRIER))
                ps->valid = 0;
        /*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index d73f17fc7778..d573165cd2b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        ti->private = s;
        ti->split_io = s->store->chunk_size;
+        ti->num_flush_requests = 1;
        return 0;
@@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
        chunk_t chunk;
        struct dm_snap_pending_exception *pe = NULL;
+        if (unlikely(bio_empty_barrier(bio))) {
+                bio->bi_bdev = s->store->cow->bdev;
+                return DM_MAPIO_REMAPPED;
+        }
        chunk = sector_to_chunk(s->store, bio->bi_sector);
        /* Full snapshots are not usable */
@@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        }
        ti->private = dev;
+        ti->num_flush_requests = 1;
        return 0;
 }
@@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
        struct dm_dev *dev = ti->private;
        bio->bi_bdev = dev->bdev;
+        if (unlikely(bio_empty_barrier(bio)))
+                return DM_MAPIO_REMAPPED;
        /* Only tell snapshots if this is a write */
        return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
 }
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 41569bc60abc..b240e85ae39a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        sc->stripes = stripes;
        sc->stripe_width = width;
        ti->split_io = chunk_size;
+        ti->num_flush_requests = stripes;
        sc->chunk_mask = ((sector_t) chunk_size) - 1;
        for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
@@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
                      union map_info *map_context)
 {
        struct stripe_c *sc = (struct stripe_c *) ti->private;
+        sector_t offset, chunk;
+        uint32_t stripe;
-        sector_t offset = bio->bi_sector - ti->begin;
+        if (unlikely(bio_empty_barrier(bio))) {
-        sector_t chunk = offset >> sc->chunk_shift;
+                BUG_ON(map_context->flush_request >= sc->stripes);
-        uint32_t stripe = sector_div(chunk, sc->stripes);
+                bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev;
+                return DM_MAPIO_REMAPPED;
+        }
+        offset = bio->bi_sector - ti->begin;
+        chunk = offset >> sc->chunk_shift;
+        stripe = sector_div(chunk, sc->stripes);
        bio->bi_bdev = sc->stripe[stripe].dev->bdev;
        bio->bi_sector = sc->stripe[stripe].physical_start +
@@ -304,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
        return error;
 }
+/*
+ * Invoke fn for the underlying device of each stripe, passing the device
+ * and that stripe's physical_start along with the caller's data pointer.
+ * The walk stops at the first callout that returns non-zero and that
+ * value is returned; if every callout returns 0 the result is 0.
+ */
+static int stripe_iterate_devices(struct dm_target *ti,
+                                  iterate_devices_callout_fn fn, void *data)
+{
+        struct stripe_c *sc = ti->private;
+        int ret = 0;
+        unsigned i = 0;
+        /* do/while: stripe 0 is always visited before the count is checked */
+        do
+                ret = fn(ti, sc->stripe[i].dev,
+                         sc->stripe[i].physical_start, data);
+        while (!ret && ++i < sc->stripes);
+        return ret;
+}
 static struct target_type stripe_target = {
        .name   = "striped",
-        .version = {1, 1, 0},
+        .version = {1, 2, 0},
        .module = THIS_MODULE,
        .ctr    = stripe_ctr,
        .dtr    = stripe_dtr,
        .map    = stripe_map,
        .end_io = stripe_end_io,
        .status = stripe_status,
+        .iterate_devices = stripe_iterate_devices,
 };
 int __init dm_stripe_init(void)
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index a2a45e6c7c8b..4b045903a4e2 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
        return strlen(buf);
 }
+/*
+ * sysfs show handler for the "suspended" attribute: writes the value of
+ * dm_suspended(md) as a decimal number followed by a newline and returns
+ * the number of characters written.
+ */
+static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
+{
+        return sprintf(buf, "%d\n", dm_suspended(md));
+}
 static DM_ATTR_RO(name);
 static DM_ATTR_RO(uuid);
+static DM_ATTR_RO(suspended);
 static struct attribute *dm_attrs[] = {
        &dm_attr_name.attr,
        &dm_attr_uuid.attr,
+        &dm_attr_suspended.attr,
        NULL,
 };
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e9a73bb242b0..4899ebe767c8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -41,6 +41,7 @@
 struct dm_table {
        struct mapped_device *md;
        atomic_t holders;
+        unsigned type;
        /* btree table */
        unsigned int depth;
@@ -62,15 +63,11 @@ struct dm_table {
        /* a list of devices used by this table */
        struct list_head devices;
-        /*
-         * These are optimistic limits taken from all the
-         * targets, some targets will need smaller limits.
-         */
-        struct io_restrictions limits;
        /* events get handed up using this callback */
        void (*event_fn)(void *);
        void *event_context;
+        struct dm_md_mempools *mempools;
 };
 /*
@@ -89,43 +86,6 @@ static unsigned int int_log(unsigned int n, unsigned int base)
 }
 /*
- * Returns the minimum that is _not_ zero, unless both are zero.
- */
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
-/*
- * Combine two io_restrictions, always taking the lower value.
- */
-static void combine_restrictions_low(struct io_restrictions *lhs,
-                                     struct io_restrictions *rhs)
-{
-        lhs->max_sectors =
-                min_not_zero(lhs->max_sectors, rhs->max_sectors);
-        lhs->max_phys_segments =
-                min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments);
-        lhs->max_hw_segments =
-                min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments);
-        lhs->logical_block_size = max(lhs->logical_block_size,
-                                      rhs->logical_block_size);
-        lhs->max_segment_size =
-                min_not_zero(lhs->max_segment_size, rhs->max_segment_size);
-        lhs->max_hw_sectors =
-                min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors);
-        lhs->seg_boundary_mask =
-                min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask);
-        lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn);
-        lhs->no_cluster |= rhs->no_cluster;
-}
-/*
 * Calculate the index of the child node of the n'th node k'th key.
 */
 static inline unsigned int get_child(unsigned int n, unsigned int k)
@@ -267,6 +227,8 @@ static void free_devices(struct list_head *devices)
        list_for_each_safe(tmp, next, devices) {
                struct dm_dev_internal *dd =
                    list_entry(tmp, struct dm_dev_internal, list);
+                DMWARN("dm_table_destroy: dm_put_device call missing for %s",
+                       dd->dm_dev.name);
                kfree(dd);
        }
 }
@@ -296,12 +258,10 @@ void dm_table_destroy(struct dm_table *t)
        vfree(t->highs);
        /* free the device list */
-        if (t->devices.next != &t->devices) {
+        if (t->devices.next != &t->devices)
-                DMWARN("devices still present during destroy: "
-                       "dm_table_remove_device calls missing");
                free_devices(&t->devices);
-        }
+        dm_free_md_mempools(t->mempools);
        kfree(t);
 }
@@ -385,15 +345,48 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 /*
 * If possible, this checks an area of a destination device is valid.
 */
-static int check_device_area(struct dm_dev_internal *dd, sector_t start,
+static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
-                             sector_t len)
+                                sector_t start, void *data)
 {
-        sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT;
+        struct queue_limits *limits = data;
+        struct block_device *bdev = dev->bdev;
+        sector_t dev_size =
+                i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+        unsigned short logical_block_size_sectors =
+                limits->logical_block_size >> SECTOR_SHIFT;
+        char b[BDEVNAME_SIZE];
        if (!dev_size)
                return 1;
-        return ((start < dev_size) && (len <= (dev_size - start)));
+        if ((start >= dev_size) || (start + ti->len > dev_size)) {
+                DMWARN("%s: %s too small for target",
+                       dm_device_name(ti->table->md), bdevname(bdev, b));
+                return 0;
+        }
+        if (logical_block_size_sectors <= 1)
+                return 1;
+        if (start & (logical_block_size_sectors - 1)) {
+                DMWARN("%s: start=%llu not aligned to h/w "
+                       "logical block size %hu of %s",
+                       dm_device_name(ti->table->md),
+                       (unsigned long long)start,
+                       limits->logical_block_size, bdevname(bdev, b));
+                return 0;
+        }
+        if (ti->len & (logical_block_size_sectors - 1)) {
+                DMWARN("%s: len=%llu not aligned to h/w "
+                       "logical block size %hu of %s",
+                       dm_device_name(ti->table->md),
+                       (unsigned long long)ti->len,
+                       limits->logical_block_size, bdevname(bdev, b));
+                return 0;
+        }
+        return 1;
 }
 /*
@@ -479,38 +472,32 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
        }
        atomic_inc(&dd->count);
-        if (!check_device_area(dd, start, len)) {
-                DMWARN("device %s too small for target", path);
-                dm_put_device(ti, &dd->dm_dev);
-                return -EINVAL;
-        }
        *result = &dd->dm_dev;
        return 0;
 }
-void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
+/*
+ * Returns the minimum that is _not_ zero, unless both are zero.
+ */
+#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
+int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
+                         sector_t start, void *data)
 {
+        struct queue_limits *limits = data;
+        struct block_device *bdev = dev->bdev;
        struct request_queue *q = bdev_get_queue(bdev);
-        struct io_restrictions *rs = &ti->limits;
        char b[BDEVNAME_SIZE];
        if (unlikely(!q)) {
                DMWARN("%s: Cannot set limits for nonexistent device %s",
                       dm_device_name(ti->table->md), bdevname(bdev, b));
-                return;
+                return 0;
        }
-        /*
+        if (blk_stack_limits(limits, &q->limits, start) < 0)
-         * Combine the device limits low.
+                DMWARN("%s: target device %s is misaligned",
-         *
+                       dm_device_name(ti->table->md), bdevname(bdev, b));
-         * FIXME: if we move an io_restriction struct
-         *        into q this would just be a call to
-         *        combine_restrictions_low()
-         */
-        rs->max_sectors =
-                min_not_zero(rs->max_sectors, queue_max_sectors(q));
        /*
         * Check if merge fn is supported.
@@ -519,48 +506,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
         */
        if (q->merge_bvec_fn && !ti->type->merge)
-                rs->max_sectors =
+                limits->max_sectors =
-                        min_not_zero(rs->max_sectors,
+                        min_not_zero(limits->max_sectors,
                                     (unsigned int) (PAGE_SIZE >> 9));
+        return 0;
-        rs->max_phys_segments =
-                min_not_zero(rs->max_phys_segments,
-                             queue_max_phys_segments(q));
-        rs->max_hw_segments =
-                min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q));
-        rs->logical_block_size = max(rs->logical_block_size,
-                                     queue_logical_block_size(q));
-        rs->max_segment_size =
-                min_not_zero(rs->max_segment_size, queue_max_segment_size(q));
-        rs->max_hw_sectors =
-                min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q));
-        rs->seg_boundary_mask =
-                min_not_zero(rs->seg_boundary_mask,
-                             queue_segment_boundary(q));
-        rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q));
-        rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 }
 EXPORT_SYMBOL_GPL(dm_set_device_limits);
 int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
                  sector_t len, fmode_t mode, struct dm_dev **result)
 {
-        int r = __table_get_device(ti->table, ti, path,
+        return __table_get_device(ti->table, ti, path,
-                                   start, len, mode, result);
+                                  start, len, mode, result);
-        if (!r)
-                dm_set_device_limits(ti, (*result)->bdev);
-        return r;
 }
 /*
 * Decrement a devices use count and remove it if necessary.
 */
@@ -675,24 +635,78 @@ int dm_split_args(int *argc, char ***argvp, char *input)
        return 0;
 }
-static void check_for_valid_limits(struct io_restrictions *rs)
+/*
+ * Impose necessary and sufficient conditions on a device's table such
+ * that any incoming bio which respects its logical_block_size can be
+ * processed successfully.  If it falls across the boundary between
+ * two or more targets, the size of each piece it gets split into must
+ * be compatible with the logical_block_size of the target processing it.
+ */
+static int validate_hardware_logical_block_alignment(struct dm_table *table,
+                                                 struct queue_limits *limits)
 {
-        if (!rs->max_sectors)
+        /*
-                rs->max_sectors = SAFE_MAX_SECTORS;
+         * This function uses arithmetic modulo the logical_block_size
-        if (!rs->max_hw_sectors)
+         * (in units of 512-byte sectors).
-                rs->max_hw_sectors = SAFE_MAX_SECTORS;
+         */
-        if (!rs->max_phys_segments)
+        unsigned short device_logical_block_size_sects =
-                rs->max_phys_segments = MAX_PHYS_SEGMENTS;
+                limits->logical_block_size >> SECTOR_SHIFT;
-        if (!rs->max_hw_segments)
-                rs->max_hw_segments = MAX_HW_SEGMENTS;
+        /*
-        if (!rs->logical_block_size)
+         * Offset of the start of the next table entry, mod logical_block_size.
-                rs->logical_block_size = 1 << SECTOR_SHIFT;
+         */
-        if (!rs->max_segment_size)
+        unsigned short next_target_start = 0;
-                rs->max_segment_size = MAX_SEGMENT_SIZE;
-        if (!rs->seg_boundary_mask)
+        /*
-                rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
+         * Given an aligned bio that extends beyond the end of a
-        if (!rs->bounce_pfn)
+         * target, how many sectors must the next target handle?
-                rs->bounce_pfn = -1;
+         */
+        unsigned short remaining = 0;
+        struct dm_target *uninitialized_var(ti);
+        struct queue_limits ti_limits;
+        unsigned i = 0;
+        /*
+         * Check each entry in the table in turn.
+         */
+        while (i < dm_table_get_num_targets(table)) {
+                ti = dm_table_get_target(table, i++);
+                blk_set_default_limits(&ti_limits);
+                /* combine all target devices' limits */
+                if (ti->type->iterate_devices)
+                        ti->type->iterate_devices(ti, dm_set_device_limits,
+                                                  &ti_limits);
+                /*
+                 * If the remaining sectors fall entirely within this
+                 * table entry are they compatible with its logical_block_size?
+                 */
+                if (remaining < ti->len &&
+                    remaining & ((ti_limits.logical_block_size >>
+                                  SECTOR_SHIFT) - 1))
+                        break;  /* Error */
+                next_target_start =
+                    (unsigned short) ((next_target_start + ti->len) &
+                                      (device_logical_block_size_sects - 1));
+                remaining = next_target_start ?
+                    device_logical_block_size_sects - next_target_start : 0;
+        }
+        if (remaining) {
+                DMWARN("%s: table line %u (start sect %llu len %llu) "
+                       "not aligned to h/w logical block size %hu",
+                       dm_device_name(table->md), i,
+                       (unsigned long long) ti->begin,
+                       (unsigned long long) ti->len,
+                       limits->logical_block_size);
+                return -EINVAL;
+        }
+        return 0;
 }
 int dm_table_add_target(struct dm_table *t, const char *type,
@@ -747,9 +761,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
        t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
-        /* FIXME: the plan is to combine high here and then have
-         * the merge fn apply the target level restrictions. */
-        combine_restrictions_low(&t->limits, &tgt->limits);
        return 0;
 bad:
@@ -758,6 +769,104 @@ int dm_table_add_target(struct dm_table *t, const char *type,
        return r;
 }
+/*
+ * Decide whether the table is bio-based or request-based and record the
+ * answer in t->type.
+ *
+ * Returns 0 on success.  Returns -EINVAL when the table mixes bio-based
+ * and request-based targets, when a request-based table uses a device
+ * whose queue is not stackable, or when a request-based table has more
+ * than one target.
+ */
+int dm_table_set_type(struct dm_table *t)
+{
+        unsigned idx;
+        unsigned has_bio_based = 0, has_request_based = 0;
+        struct dm_dev_internal *dd;
+        struct list_head *devices;
+        /* Classify every target; a mixture of kinds is rejected at once. */
+        for (idx = 0; idx < t->num_targets; idx++) {
+                struct dm_target *tgt = t->targets + idx;
+                if (dm_target_request_based(tgt))
+                        has_request_based = 1;
+                else
+                        has_bio_based = 1;
+                if (has_bio_based && has_request_based) {
+                        DMWARN("Inconsistent table: different target types"
+                               " can't be mixed up");
+                        return -EINVAL;
+                }
+        }
+        /* Any bio-based target makes the whole table bio-based. */
+        if (has_bio_based) {
+                t->type = DM_TYPE_BIO_BASED;
+                return 0;
+        }
+        /* Neither flag set means the table has no targets at all. */
+        BUG_ON(!has_request_based);
+        /* Request-based dm needs every underlying queue to be stackable. */
+        devices = dm_table_get_devices(t);
+        list_for_each_entry(dd, devices, list) {
+                struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
+                if (!blk_queue_stackable(q)) {
+                        DMWARN("table load rejected: including"
+                               " non-request-stackable devices");
+                        return -EINVAL;
+                }
+        }
+        /*
+         * Only single-target tables are supported for request-based dm.
+         * Multiple targets would require request splitting, which in turn
+         * needs substantial block-layer changes (e.g. in the completion
+         * path for partially completed requests).
+         */
+        if (t->num_targets > 1) {
+                DMWARN("Request-based dm doesn't support multiple targets yet");
+                return -EINVAL;
+        }
+        t->type = DM_TYPE_REQUEST_BASED;
+        return 0;
+}
+/*
+ * Return the table type stored in t->type (set by dm_table_set_type()).
+ */
+unsigned dm_table_get_type(struct dm_table *t)
+{
+        return t->type;
+}
+/*
+ * True when the table's recorded type is DM_TYPE_BIO_BASED.
+ */
+bool dm_table_bio_based(struct dm_table *t)
+{
+        return t->type == DM_TYPE_BIO_BASED;
+}
+/*
+ * True when the table's recorded type is DM_TYPE_REQUEST_BASED.
+ */
+bool dm_table_request_based(struct dm_table *t)
+{
+        return t->type == DM_TYPE_REQUEST_BASED;
+}
+/*
+ * Allocate the mempools matching the table's type and store them in
+ * t->mempools.
+ *
+ * Returns 0 on success, -EINVAL if no table type has been set yet, or
+ * -ENOMEM if dm_alloc_md_mempools() returns NULL.
+ */
+int dm_table_alloc_md_mempools(struct dm_table *t)
+{
+        struct dm_md_mempools *pools;
+        unsigned table_type = dm_table_get_type(t);
+        if (unlikely(table_type == DM_TYPE_NONE)) {
+                DMWARN("no table type is set, can't allocate mempools");
+                return -EINVAL;
+        }
+        pools = dm_alloc_md_mempools(table_type);
+        t->mempools = pools;
+        return pools ? 0 : -ENOMEM;
+}
+/*
+ * Hand the table's mempools to dm_free_md_mempools() and clear the
+ * pointer, so the table no longer refers to the released pools.
+ * NOTE(review): dm_table_destroy() also passes t->mempools to
+ * dm_free_md_mempools(); presumably that helper tolerates NULL - confirm.
+ */
+void dm_table_free_md_mempools(struct dm_table *t)
+{
+        dm_free_md_mempools(t->mempools);
+        t->mempools = NULL;
+}
+/*
+ * Return the mempools currently attached to the table (t->mempools),
+ * which is NULL after dm_table_free_md_mempools() or a failed allocation
+ * in dm_table_alloc_md_mempools().
+ */
+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
+{
+        return t->mempools;
+}
 static int setup_indexes(struct dm_table *t)
 {
        int i;
@@ -792,8 +901,6 @@ int dm_table_complete(struct dm_table *t)
        int r = 0;
        unsigned int leaf_nodes;
-        check_for_valid_limits(&t->limits);
        /* how many indexes will the btree have ? */
        leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
        t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -869,6 +976,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
 }
 /*
+ * iterate_devices callout that inverts device_area_is_valid().
+ *
+ * An iterate_devices implementation may stop at the first callout that
+ * returns non-zero (the striped target's does).  device_area_is_valid()
+ * returns 1 for a good device, so using it directly would end the walk
+ * at the first valid device and leave the remaining ones unchecked.
+ * Returning non-zero for an invalid area instead makes a single bad
+ * device enough to fail the target.
+ */
+static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
+                                  sector_t start, void *data)
+{
+        return !device_area_is_valid(ti, dev, start, data);
+}
+/*
+ * Establish the new table's queue_limits and validate them.
+ *
+ * For each target, the limits of all its devices are stacked into a
+ * per-target queue_limits, each device area is checked against those
+ * limits, and the result is merged into *limits.  Targets without an
+ * iterate_devices method contribute the default limits.
+ *
+ * Returns -EINVAL if any device area is invalid, otherwise the result of
+ * validate_hardware_logical_block_alignment().
+ */
+int dm_calculate_queue_limits(struct dm_table *table,
+                              struct queue_limits *limits)
+{
+        struct dm_target *uninitialized_var(ti);
+        struct queue_limits ti_limits;
+        unsigned i = 0;
+        blk_set_default_limits(limits);
+        while (i < dm_table_get_num_targets(table)) {
+                blk_set_default_limits(&ti_limits);
+                ti = dm_table_get_target(table, i++);
+                if (ti->type->iterate_devices) {
+                        /*
+                         * Combine queue limits of all the devices this
+                         * target uses.
+                         */
+                        ti->type->iterate_devices(ti, dm_set_device_limits,
+                                                  &ti_limits);
+                        /*
+                         * Check each device area is consistent with the
+                         * target's overall queue limits.  A non-zero
+                         * result means some device failed the check.
+                         */
+                        if (ti->type->iterate_devices(ti,
+                                                      device_area_is_invalid,
+                                                      &ti_limits))
+                                return -EINVAL;
+                }
+                /*
+                 * Merge this target's queue limits into the overall limits
+                 * for the table.
+                 */
+                if (blk_stack_limits(limits, &ti_limits, 0) < 0)
+                        DMWARN("%s: target device "
+                               "(start sect %llu len %llu) "
+                               "is misaligned",
+                               dm_device_name(table->md),
+                               (unsigned long long) ti->begin,
+                               (unsigned long long) ti->len);
+        }
+        return validate_hardware_logical_block_alignment(table, limits);
+}
+/*
 * Set the integrity profile for this device if all devices used have
 * matching profiles.
 */
@@ -907,27 +1065,42 @@ no_integrity:
        return;
 }
-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+                               struct queue_limits *limits)
 {
        /*
-         * Make sure we obey the optimistic sub devices
+         * Each target device in the table has a data area that should normally
-         * restrictions.
+         * be aligned such that the DM device's alignment_offset is 0.
+         * FIXME: Propagate alignment_offsets up the stack and warn of
+         *        sub-optimal or inconsistent settings.
+         */
+        limits->alignment_offset = 0;
+        limits->misaligned = 0;
+        /*
+         * Copy table's limits to the DM device's request_queue
         */
-        blk_queue_max_sectors(q, t->limits.max_sectors);
+        q->limits = *limits;
-        blk_queue_max_phys_segments(q, t->limits.max_phys_segments);
-        blk_queue_max_hw_segments(q, t->limits.max_hw_segments);
+        if (limits->no_cluster)
-        blk_queue_logical_block_size(q, t->limits.logical_block_size);
-        blk_queue_max_segment_size(q, t->limits.max_segment_size);
-        blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
-        blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
-        blk_queue_bounce_limit(q, t->limits.bounce_pfn);
-        if (t->limits.no_cluster)
                queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
        else
                queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
        dm_table_set_integrity(t);
+        /*
+         * QUEUE_FLAG_STACKABLE must be set after all queue settings are
+         * visible to other CPUs because, once the flag is set, incoming bios
+         * are processed by request-based dm, which refers to the queue
+         * settings.
+         * Until the flag is set, bios are passed to bio-based dm and queued to
+         * md->deferred where queue settings are not needed yet.
+         * Those bios are passed to request-based dm at the resume time.
+         */
+        smp_mb();
+        if (dm_table_request_based(t))
+                queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
 }
 unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -1023,6 +1196,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
        return r;
 }
+/*
+ * Return 1 if any target in the table provides a busy method and that
+ * method reports the target as busy; otherwise return 0.
+ */
+int dm_table_any_busy_target(struct dm_table *t)
+{
+        unsigned idx;
+        for (idx = 0; idx < t->num_targets; idx++) {
+                struct dm_target *tgt = t->targets + idx;
+                /* Targets without a busy method are never considered busy. */
+                if (!tgt->type->busy)
+                        continue;
+                if (tgt->type->busy(tgt))
+                        return 1;
+        }
+        return 0;
+}
 void dm_table_unplug_all(struct dm_table *t)
 {
        struct dm_dev_internal *dd;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 48db308fae67..3c6d4ee8921d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,13 @@
 #define DM_MSG_PREFIX "core"
+/*
+ * Cookies are numeric values sent with CHANGE and REMOVE
+ * uevents while resuming, removing or renaming the device.
+ */
+#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
+#define DM_COOKIE_LENGTH 24
 static const char *_name = DM_NAME;
 static unsigned int major = 0;
@@ -71,7 +78,7 @@ struct dm_rq_target_io {
 */
 struct dm_rq_clone_bio_info {
        struct bio *orig;
-        struct request *rq;
+        struct dm_rq_target_io *tio;
 };
 union map_info *dm_get_mapinfo(struct bio *bio)
@@ -81,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio)
        return NULL;
 }
+/*
+ * Return the map_info embedded in the dm_rq_target_io that the request's
+ * end_io_data points to, or NULL if rq is NULL or carries no end_io_data.
+ */
+union map_info *dm_get_rq_mapinfo(struct request *rq)
+{
+        struct dm_rq_target_io *tio;
+        if (!rq || !rq->end_io_data)
+                return NULL;
+        tio = rq->end_io_data;
+        return &tio->info;
+}
+EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define MINOR_ALLOCED ((void *)-1)
 /*
@@ -157,13 +172,31 @@ struct mapped_device {
         * freeze/thaw support require holding onto a super block
         */
        struct super_block *frozen_sb;
-        struct block_device *suspended_bdev;
+        struct block_device *bdev;
        /* forced geometry settings */
        struct hd_geometry geometry;
+        /* marker of flush suspend for request-based dm */
+        struct request suspend_rq;
+        /* For saving the address of __make_request for request based dm */
+        make_request_fn *saved_make_request_fn;
        /* sysfs handle */
        struct kobject kobj;
+        /* zero-length barrier that will be cloned and submitted to targets */
+        struct bio barrier_bio;
+};
+/*
+ * For mempools pre-allocation at the table loading time.
+ */
+struct dm_md_mempools {
+        mempool_t *io_pool;
+        mempool_t *tio_pool;
+        struct bio_set *bs;
 };
 #define MIN_IOS 256
@@ -391,14 +424,29 @@ static void free_io(struct mapped_device *md, struct dm_io *io)
        mempool_free(io, md->io_pool);
 }
-static struct dm_target_io *alloc_tio(struct mapped_device *md)
+static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 {
-        return mempool_alloc(md->tio_pool, GFP_NOIO);
+        mempool_free(tio, md->tio_pool);
 }
-static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
+static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
 {
-        mempool_free(tio, md->tio_pool);
+        return mempool_alloc(md->tio_pool, GFP_ATOMIC);
+}
+static void free_rq_tio(struct dm_rq_target_io *tio)
+{
+        mempool_free(tio, tio->md->tio_pool);
+}
+static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
+{
+        return mempool_alloc(md->io_pool, GFP_ATOMIC);
+}
+static void free_bio_info(struct dm_rq_clone_bio_info *info)
+{
+        mempool_free(info, info->tio->md->io_pool);
 }
 static void start_io_acct(struct dm_io *io)
@@ -464,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
 struct dm_table *dm_get_table(struct mapped_device *md)
 {
        struct dm_table *t;
+        unsigned long flags;
-        read_lock(&md->map_lock);
+        read_lock_irqsave(&md->map_lock, flags);
        t = md->map;
        if (t)
                dm_table_get(t);
-        read_unlock(&md->map_lock);
+        read_unlock_irqrestore(&md->map_lock, flags);
        return t;
 }
@@ -536,9 +585,11 @@ static void dec_pending(struct dm_io *io, int error)
                         * Target requested pushing back the I/O.
                         */
                        spin_lock_irqsave(&md->deferred_lock, flags);
-                        if (__noflush_suspending(md))
+                        if (__noflush_suspending(md)) {
-                                bio_list_add_head(&md->deferred, io->bio);
+                                if (!bio_barrier(io->bio))
-                        else
+                                        bio_list_add_head(&md->deferred,
+                                                          io->bio);
+                        } else
                                /* noflush suspend was interrupted. */
                                io->error = -EIO;
                        spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -553,7 +604,8 @@ static void dec_pending(struct dm_io *io, int error)
                         * a per-device variable for error reporting.
                         * Note that you can't touch the bio after end_io_acct
                         */
-                        md->barrier_error = io_error;
+                        if (!md->barrier_error && io_error != -EOPNOTSUPP)
+                                md->barrier_error = io_error;
                        end_io_acct(io);
                } else {
                        end_io_acct(io);
@@ -607,6 +659,262 @@ static void clone_endio(struct bio *bio, int error)
        dec_pending(io, error);
 }
+/*
+ * Partial completion handling for request-based dm
+ *
+ * Completion handler for one clone bio (installed as bi_end_io by
+ * dm_rq_bio_constructor()).  The first error seen is latched in tio->error;
+ * on success the original request is advanced by the original bio's size.
+ */
+static void end_clone_bio(struct bio *clone, int error)
+{
+        struct dm_rq_clone_bio_info *info = clone->bi_private;
+        struct dm_rq_target_io *tio = info->tio;
+        struct bio *bio = info->orig;
+        unsigned int nr_bytes = info->orig->bi_size;
+        bio_put(clone); /* everything needed from info was copied out above */
+        if (tio->error)
+                /*
+                 * An error has already been detected on the request.
+                 * Once error occurred, just let clone->end_io() handle
+                 * the remainder.
+                 */
+                return;
+        else if (error) {
+                /*
+                 * Don't notify the upper layer of the error yet.
+                 * The error handling decision is made by the target driver,
+                 * when the request is completed.
+                 */
+                tio->error = error;
+                return;
+        }
+        /*
+         * I/O for the bio successfully completed.
+         * Notify the upper layer of the data completion.
+         */
+        /*
+         * bios are processed from the head of the list.
+         * So the completing bio should always be rq->bio.
+         * If it's not, something wrong is happening.
+         * (The mismatch is only logged; the update below still happens.)
+         */
+        if (tio->orig->bio != bio)
+                DMERR("bio completion is going in the middle of the request");
+        /*
+         * Update the original request.
+         * Do not use blk_end_request() here, because it may complete
+         * the original request before the clone, and break the ordering.
+         */
+        blk_update_request(tio->orig, 0, nr_bytes);
+}
+/*
+ * Don't touch any member of the md after calling this function because
+ * the md may be freed in dm_put() at the end of this function.
+ * Or do dm_get() before calling this function and dm_put() later.
+ *
+ * Wakes md->wait when the queue has nothing in flight, re-runs the queue
+ * when run_queue is non-zero, and finally drops one md reference
+ * (map_request() takes a reference with dm_get() for each in-flight I/O).
+ */
+static void rq_completed(struct mapped_device *md, int run_queue)
+{
+        int wakeup_waiters = 0;
+        struct request_queue *q = md->queue;
+        unsigned long flags;
+        spin_lock_irqsave(q->queue_lock, flags); /* in-flight count is sampled under the queue lock */
+        if (!queue_in_flight(q))
+                wakeup_waiters = 1;
+        spin_unlock_irqrestore(q->queue_lock, flags);
+        /* nudge anyone waiting on suspend queue */
+        if (wakeup_waiters)
+                wake_up(&md->wait);
+        if (run_queue)
+                blk_run_queue(q);
+        /*
+         * dm_put() must be at the end of this function. See the comment above
+         */
+        dm_put(md);
+}
+static void dm_unprep_request(struct request *rq)
+{
+        struct request *clone = rq->special; /* clone attached by dm_prep_fn() */
+        struct dm_rq_target_io *tio = clone->end_io_data; /* set in setup_clone() */
+        rq->special = NULL; /* reverse the rq->special / REQ_DONTPREP setup done in dm_prep_fn() */
+        rq->cmd_flags &= ~REQ_DONTPREP;
+        blk_rq_unprep_clone(clone);
+        free_rq_tio(tio); /* the clone is embedded in tio (&tio->clone), so it is gone too */
+}
+/*
+ * Requeue the original request of a clone.
+ *
+ * The clone and its tio are torn down by dm_unprep_request(); the original
+ * request is put back on its queue under the queue lock (plugging the queue
+ * first when the elevator is empty).  rq_completed() is then called with
+ * run_queue == 0, which also drops an md reference.
+ */
+void dm_requeue_unmapped_request(struct request *clone)
+{
+        struct dm_rq_target_io *tio = clone->end_io_data;
+        struct mapped_device *md = tio->md; /* saved now: tio is freed by dm_unprep_request() */
+        struct request *rq = tio->orig;
+        struct request_queue *q = rq->q;
+        unsigned long flags;
+        dm_unprep_request(rq); /* clone and tio must not be used after this */
+        spin_lock_irqsave(q->queue_lock, flags);
+        if (elv_queue_empty(q))
+                blk_plug_device(q);
+        blk_requeue_request(q, rq);
+        spin_unlock_irqrestore(q->queue_lock, flags);
+        rq_completed(md, 0);
+}
+EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
+static void __stop_queue(struct request_queue *q)
+{
+        blk_stop_queue(q); /* callers visible here (stop_queue(), dm_request_fn()) hold q->queue_lock */
+}
+static void stop_queue(struct request_queue *q)
+{
+        unsigned long flags;
+        spin_lock_irqsave(q->queue_lock, flags); /* locked wrapper around __stop_queue() */
+        __stop_queue(q);
+        spin_unlock_irqrestore(q->queue_lock, flags);
+}
+static void __start_queue(struct request_queue *q)
+{
+        if (blk_queue_stopped(q)) /* no-op unless the queue is currently stopped */
+                blk_start_queue(q);
+}
+static void start_queue(struct request_queue *q)
+{
+        unsigned long flags;
+        spin_lock_irqsave(q->queue_lock, flags); /* locked wrapper around __start_queue() */
+        __start_queue(q);
+        spin_unlock_irqrestore(q->queue_lock, flags);
+}
+/*
+ * Complete the clone and the original request.
+ * Must be called without queue lock.
+ *
+ * For blk_pc_request() originals, errors, resid_len and (when a sense
+ * buffer is present) sense_len are copied from the clone first.  The tio is
+ * freed, the whole original request is ended with 'error', and
+ * rq_completed() is called with run_queue == 1.
+ */
+static void dm_end_request(struct request *clone, int error)
+{
+        struct dm_rq_target_io *tio = clone->end_io_data;
+        struct mapped_device *md = tio->md;
+        struct request *rq = tio->orig;
+        if (blk_pc_request(rq)) {
+                rq->errors = clone->errors;
+                rq->resid_len = clone->resid_len;
+                if (rq->sense)
+                        /*
+                         * We are using the sense buffer of the original
+                         * request.
+                         * So setting the length of the sense data is enough.
+                         */
+                        rq->sense_len = clone->sense_len;
+        }
+        BUG_ON(clone->bio); /* the clone must have no bios left at this point */
+        free_rq_tio(tio); /* md and rq were saved above; tio and the embedded clone are gone now */
+        blk_end_request_all(rq, error);
+        rq_completed(md, 1);
+}
+/*
+ * Request completion handler for request-based dm
+ *
+ * Picks up the clone stored in rq->completion_data and the error stored in
+ * tio->error by dm_complete_request().  Unless REQ_FAILED is set on the
+ * original request, the target's rq_end_io hook (if any) may replace the
+ * error.  The result then selects between completing, leaving the I/O to
+ * the target, and requeueing; any other positive value is a BUG().
+ */
+static void dm_softirq_done(struct request *rq)
+{
+        struct request *clone = rq->completion_data;
+        struct dm_rq_target_io *tio = clone->end_io_data;
+        dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
+        int error = tio->error;
+        if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io) /* REQ_FAILED is set by dm_kill_unmapped_request() */
+                error = rq_end_io(tio->ti, clone, error, &tio->info);
+        if (error <= 0)
+                /* The target wants to complete the I/O */
+                dm_end_request(clone, error);
+        else if (error == DM_ENDIO_INCOMPLETE)
+                /* The target will handle the I/O */
+                return;
+        else if (error == DM_ENDIO_REQUEUE)
+                /* The target wants to requeue the I/O */
+                dm_requeue_unmapped_request(clone);
+        else {
+                DMWARN("unimplemented target endio return value: %d", error);
+                BUG();
+        }
+}
+/*
+ * Complete the clone and the original request with the error status
+ * through softirq context.
+ *
+ * The error is stashed in tio->error and the clone in rq->completion_data;
+ * dm_softirq_done() reads both back.
+ */
+static void dm_complete_request(struct request *clone, int error)
+{
+        struct dm_rq_target_io *tio = clone->end_io_data;
+        struct request *rq = tio->orig;
+        tio->error = error;
+        rq->completion_data = clone;
+        blk_complete_request(rq); /* dm_softirq_done() is registered via blk_queue_softirq_done() in alloc_dev() */
+}
+/*
+ * Complete the not-mapped clone and the original request with the error status
+ * through softirq context.
+ * Target's rq_end_io() function isn't called.
+ * This may be used when the target's map_rq() function fails.
+ *
+ * The rq_end_io() call is suppressed by setting REQ_FAILED on the original
+ * request, which dm_softirq_done() checks.
+ */
+void dm_kill_unmapped_request(struct request *clone, int error)
+{
+        struct dm_rq_target_io *tio = clone->end_io_data;
+        struct request *rq = tio->orig;
+        rq->cmd_flags |= REQ_FAILED;
+        dm_complete_request(clone, error);
+}
+EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
+/*
+ * Called with the queue lock held
+ *
+ * end_io handler of the clone request (installed in setup_clone()).
+ */
+static void end_clone_request(struct request *clone, int error)
+{
+        /*
+         * For just cleaning up the information of the queue in which
+         * the clone was dispatched.
+         * The clone is *NOT* freed actually here because it is alloced from
+         * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
+         */
+        __blk_put_request(clone->q, clone);
+        /*
+         * Actual request completion is done in a softirq context which doesn't
+         * hold the queue lock.  Otherwise, deadlock could occur because:
+         *     - another request may be submitted by the upper level driver
+         *       of the stacking during the completion
+         *     - the submission which requires queue lock may be done
+         *       against this queue
+         */
+        dm_complete_request(clone, error);
+}
 static sector_t max_io_len(struct mapped_device *md,
                           sector_t sector, struct dm_target *ti)
 {
@@ -634,11 +942,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
        sector_t sector;
        struct mapped_device *md;
-        /*
-         * Sanity checks.
-         */
-        BUG_ON(!clone->bi_size);
        clone->bi_end_io = clone_endio;
        clone->bi_private = tio;
@@ -752,6 +1055,48 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
        return clone;
 }
+static struct dm_target_io *alloc_tio(struct clone_info *ci,
+                                      struct dm_target *ti)
+{
+        struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
+        tio->io = ci->io; /* NOTE(review): tio is used without a NULL check - presumably this mempool_alloc() cannot fail; confirm */
+        tio->ti = ti;
+        memset(&tio->info, 0, sizeof(tio->info)); /* target-private map info starts zeroed */
+        return tio;
+}
+static void __flush_target(struct clone_info *ci, struct dm_target *ti,
+                          unsigned flush_nr)
+{
+        struct dm_target_io *tio = alloc_tio(ci, ti);
+        struct bio *clone;
+        tio->info.flush_request = flush_nr; /* index of this flush, 0 .. ti->num_flush_requests - 1 (see caller) */
+        clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); /* NOTE(review): result is not NULL-checked; confirm it cannot fail here */
+        __bio_clone(clone, ci->bio);
+        clone->bi_destructor = dm_bio_destructor;
+        __map_bio(ti, clone, tio); /* hand the cloned barrier to target ti */
+}
+static int __clone_and_map_empty_barrier(struct clone_info *ci)
+{
+        unsigned target_nr = 0, flush_nr;
+        struct dm_target *ti;
+        while ((ti = dm_table_get_target(ci->map, target_nr++))) /* walk targets until the lookup returns NULL */
+                for (flush_nr = 0; flush_nr < ti->num_flush_requests;
+                     flush_nr++)
+                        __flush_target(ci, ti, flush_nr); /* one clone per flush request of each target */
+        ci->sector_count = 0; /* __split_and_process_bio() set this to 1 for an empty barrier; presumably zeroing it ends that caller's loop */
+        return 0;
+}
 static int __clone_and_map(struct clone_info *ci)
 {
        struct bio *clone, *bio = ci->bio;
@@ -759,6 +1104,9 @@ static int __clone_and_map(struct clone_info *ci)
        sector_t len = 0, max;
        struct dm_target_io *tio;
+        if (unlikely(bio_empty_barrier(bio)))
+                return __clone_and_map_empty_barrier(ci);
        ti = dm_table_find_target(ci->map, ci->sector);
        if (!dm_target_is_valid(ti))
                return -EIO;
@@ -768,10 +1116,7 @@ static int __clone_and_map(struct clone_info *ci)
        /*
         * Allocate a target io object.
         */
-        tio = alloc_tio(ci->md);
+        tio = alloc_tio(ci, ti);
-        tio->io = ci->io;
-        tio->ti = ti;
-        memset(&tio->info, 0, sizeof(tio->info));
        if (ci->sector_count <= max) {
                /*
@@ -827,10 +1172,7 @@ static int __clone_and_map(struct clone_info *ci)
                                max = max_io_len(ci->md, ci->sector, ti);
-                                tio = alloc_tio(ci->md);
+                                tio = alloc_tio(ci, ti);
-                                tio->io = ci->io;
-                                tio->ti = ti;
-                                memset(&tio->info, 0, sizeof(tio->info));
                        }
                        len = min(remaining, max);
@@ -865,7 +1207,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
                if (!bio_barrier(bio))
                        bio_io_error(bio);
                else
-                        md->barrier_error = -EIO;
+                        if (!md->barrier_error)
+                                md->barrier_error = -EIO;
                return;
        }
@@ -878,6 +1221,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
        ci.io->md = md;
        ci.sector = bio->bi_sector;
        ci.sector_count = bio_sectors(bio);
+        if (unlikely(bio_empty_barrier(bio)))
+                ci.sector_count = 1;
        ci.idx = bio->bi_idx;
        start_io_acct(ci.io);
@@ -925,6 +1270,16 @@ static int dm_merge_bvec(struct request_queue *q,
         */
        if (max_size && ti->type->merge)
                max_size = ti->type->merge(ti, bvm, biovec, max_size);
+        /*
+         * If the target doesn't support merge method and some of the devices
+         * provided their merge_bvec method (we know this by looking at
+         * queue_max_hw_sectors), then we can't allow bios with multiple vector
+         * entries.  So always set max_size to 0, and the code below allows
+         * just one page.
+         */
+        else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
+                max_size = 0;
 out_table:
        dm_table_put(map);
@@ -943,7 +1298,7 @@ out:
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
-static int dm_request(struct request_queue *q, struct bio *bio)
+static int _dm_request(struct request_queue *q, struct bio *bio)
 {
        int rw = bio_data_dir(bio);
        struct mapped_device *md = q->queuedata;
@@ -980,12 +1335,274 @@ static int dm_request(struct request_queue *q, struct bio *bio)
        return 0;
 }
+static int dm_make_request(struct request_queue *q, struct bio *bio)
+{
+        struct mapped_device *md = q->queuedata;
+        if (unlikely(bio_barrier(bio))) { /* barrier bios are rejected on the request-based path */
+                bio_endio(bio, -EOPNOTSUPP);
+                return 0;
+        }
+        return md->saved_make_request_fn(q, bio); /* call __make_request() */
+}
+static int dm_request_based(struct mapped_device *md)
+{
+        return blk_queue_stackable(md->queue); /* the stackable queue flag is what marks a device as request-based here */
+}
+static int dm_request(struct request_queue *q, struct bio *bio)
+{
+        struct mapped_device *md = q->queuedata;
+        if (dm_request_based(md)) /* dispatch on device type: request-based vs. bio-based */
+                return dm_make_request(q, bio);
+        return _dm_request(q, bio);
+}
+void dm_dispatch_request(struct request *rq)
+{
+        int r;
+        if (blk_queue_io_stat(rq->q)) /* rq is the clone here (see the call in map_request()) */
+                rq->cmd_flags |= REQ_IO_STAT;
+        rq->start_time = jiffies;
+        r = blk_insert_cloned_request(rq->q, rq);
+        if (r)
+                dm_complete_request(rq, r); /* insertion failed: complete with that error via softirq */
+}
+EXPORT_SYMBOL_GPL(dm_dispatch_request);
+static void dm_rq_bio_destructor(struct bio *bio)
+{
+        struct dm_rq_clone_bio_info *info = bio->bi_private;
+        struct mapped_device *md = info->tio->md; /* fetched before info is freed */
+        free_bio_info(info);
+        bio_free(bio, md->bs);
+}
+static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
+                                 void *data)
+{
+        struct dm_rq_target_io *tio = data; /* setup_clone() passes the tio as 'data' */
+        struct mapped_device *md = tio->md;
+        struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
+        if (!info)
+                return -ENOMEM;
+        info->orig = bio_orig; /* link the clone bio back to its original and to the tio */
+        info->tio = tio;
+        bio->bi_end_io = end_clone_bio;
+        bio->bi_private = info;
+        bio->bi_destructor = dm_rq_bio_destructor; /* frees info together with the bio */
+        return 0;
+}
+static int setup_clone(struct request *clone, struct request *rq,
+                       struct dm_rq_target_io *tio)
+{
+        int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+                                  dm_rq_bio_constructor, tio);
+        if (r)
+                return r; /* propagate the failure; dm_prep_fn() turns it into BLKPREP_DEFER */
+        clone->cmd = rq->cmd; /* cmd, sense and buffer are shared with the original by pointer, not copied */
+        clone->cmd_len = rq->cmd_len;
+        clone->sense = rq->sense;
+        clone->buffer = rq->buffer;
+        clone->end_io = end_clone_request;
+        clone->end_io_data = tio; /* how the completion paths find the tio from a clone */
+        return 0;
+}
+static int dm_rq_flush_suspending(struct mapped_device *md)
+{
+        return !md->suspend_rq.special; /* non-NULL means invalidated (dm_rq_invalidate_suspend_marker() stores 0x1) */
+}
+/*
+ * Called with the queue lock held.
+ *
+ * Prepare hook registered with blk_queue_prep_rq() in alloc_dev().
+ * Returns BLKPREP_OK once a clone is attached to rq->special (or for a
+ * still-valid suspend marker), BLKPREP_DEFER when the tio allocation or the
+ * clone setup fails, and BLKPREP_KILL for an invalidated suspend marker or
+ * a request that already has rq->special set.
+ */
+static int dm_prep_fn(struct request_queue *q, struct request *rq)
+{
+        struct mapped_device *md = q->queuedata;
+        struct dm_rq_target_io *tio;
+        struct request *clone;
+        if (unlikely(rq == &md->suspend_rq)) {
+                if (dm_rq_flush_suspending(md))
+                        return BLKPREP_OK;
+                else
+                        /* The flush suspend was interrupted */
+                        return BLKPREP_KILL;
+        }
+        if (unlikely(rq->special)) {
+                DMWARN("Already has something in rq->special.");
+                return BLKPREP_KILL;
+        }
+        tio = alloc_rq_tio(md); /* Only one for each original request */
+        if (!tio)
+                /* -ENOMEM */
+                return BLKPREP_DEFER;
+        tio->md = md;
+        tio->ti = NULL; /* the target is filled in later by map_request() */
+        tio->orig = rq;
+        tio->error = 0;
+        memset(&tio->info, 0, sizeof(tio->info));
+        clone = &tio->clone; /* the clone request is embedded in the tio */
+        if (setup_clone(clone, rq, tio)) {
+                /* -ENOMEM */
+                free_rq_tio(tio);
+                return BLKPREP_DEFER;
+        }
+        rq->special = clone;
+        rq->cmd_flags |= REQ_DONTPREP;
+        return BLKPREP_OK;
+}
+static void map_request(struct dm_target *ti, struct request *rq,
+                        struct mapped_device *md)
+{
+        int r;
+        struct request *clone = rq->special; /* clone attached by dm_prep_fn() */
+        struct dm_rq_target_io *tio = clone->end_io_data;
+        /*
+         * Hold the md reference here for the in-flight I/O.
+         * We can't rely on the reference count by device opener,
+         * because the device may be closed during the request completion
+         * when all bios are completed.
+         * See the comment in rq_completed() too.
+         */
+        dm_get(md);
+        tio->ti = ti;
+        r = ti->type->map_rq(ti, clone, &tio->info); /* the target decides what happens to the clone */
+        switch (r) {
+        case DM_MAPIO_SUBMITTED:
+                /* The target has taken the I/O to submit by itself later */
+                break;
+        case DM_MAPIO_REMAPPED:
+                /* The target has remapped the I/O so dispatch it */
+                dm_dispatch_request(clone);
+                break;
+        case DM_MAPIO_REQUEUE:
+                /* The target wants to requeue the I/O */
+                dm_requeue_unmapped_request(clone);
+                break;
+        default:
+                if (r > 0) {
+                        DMWARN("unimplemented target map return value: %d", r);
+                        BUG();
+                }
+                /* The target wants to complete the I/O */
+                dm_kill_unmapped_request(clone, r); /* r <= 0 here and is passed on as the error */
+                break;
+        }
+}
+/*
+ * q->request_fn for request-based dm.
+ * Called with the queue lock held.
+ *
+ * Peeks requests off the queue and hands each one to map_request() until
+ * the queue is plugged, stopped or empty, the target reports busy, or the
+ * flush-suspend marker is reached.  The table reference taken at entry is
+ * dropped on every exit path.
+ */
+static void dm_request_fn(struct request_queue *q)
+{
+        struct mapped_device *md = q->queuedata;
+        struct dm_table *map = dm_get_table(md);
+        struct dm_target *ti;
+        struct request *rq;
+        /*
+         * For noflush suspend, check blk_queue_stopped() to immediately
+         * quit I/O dispatching.
+         */
+        while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+                rq = blk_peek_request(q);
+                if (!rq)
+                        goto plug_and_out;
+                if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */
+                        if (queue_in_flight(q))
+                                /* Not quiet yet.  Wait more */
+                                goto plug_and_out;
+                        /* This device should be quiet now */
+                        __stop_queue(q);
+                        blk_start_request(rq);
+                        __blk_end_request_all(rq, 0);
+                        wake_up(&md->wait);
+                        goto out;
+                }
+                ti = dm_table_find_target(map, blk_rq_pos(rq)); /* NOTE(review): neither map nor ti is validated before use - confirm this cannot run without a bound table */
+                if (ti->type->busy && ti->type->busy(ti))
+                        goto plug_and_out;
+                blk_start_request(rq);
+                spin_unlock(q->queue_lock); /* drop the queue lock while the target maps the request */
+                map_request(ti, rq, md);
+                spin_lock_irq(q->queue_lock); /* NOTE(review): relock uses the _irq variant although the unlock does not - presumably entered with IRQs off; confirm */
+        }
+        goto out;
+plug_and_out:
+        if (!elv_queue_empty(q))
+                /* Some requests still remain, retry later */
+                blk_plug_device(q);
+out:
+        dm_table_put(map);
+        return;
+}
+int dm_underlying_device_busy(struct request_queue *q)
+{
+        return blk_lld_busy(q); /* thin exported wrapper; the result is passed through unchanged */
+}
+EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
+static int dm_lld_busy(struct request_queue *q)
+{
+        int r;
+        struct mapped_device *md = q->queuedata;
+        struct dm_table *map = dm_get_table(md);
+        if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) /* no table, or I/O blocked for suspend: report busy */
+                r = 1;
+        else
+                r = dm_table_any_busy_target(map);
+        dm_table_put(map); /* NOTE(review): also reached with map == NULL - presumably dm_table_put() tolerates that; confirm */
+        return r;
+}
 static void dm_unplug_all(struct request_queue *q)
 {
        struct mapped_device *md = q->queuedata;
        struct dm_table *map = dm_get_table(md);
        if (map) {
+                if (dm_request_based(md))
+                        generic_unplug_device(q);
                dm_table_unplug_all(map);
                dm_table_put(map);
        }
@@ -1000,7 +1617,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
        if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
                map = dm_get_table(md);
                if (map) {
-                        r = dm_table_any_congested(map, bdi_bits);
+                        /*
+                         * Request-based dm cares about only own queue for
+                         * the query about congestion status of request_queue
+                         */
+                        if (dm_request_based(md))
+                                r = md->queue->backing_dev_info.state &
+                                    bdi_bits;
+                        else
+                                r = dm_table_any_congested(map, bdi_bits);
                        dm_table_put(map);
                }
        }
@@ -1123,30 +1749,32 @@ static struct mapped_device *alloc_dev(int minor)
        INIT_LIST_HEAD(&md->uevent_list);
        spin_lock_init(&md->uevent_lock);
-        md->queue = blk_alloc_queue(GFP_KERNEL);
+        md->queue = blk_init_queue(dm_request_fn, NULL);
        if (!md->queue)
                goto bad_queue;
+        /*
+         * Request-based dm devices cannot be stacked on top of bio-based dm
+         * devices.  The type of this dm device has not been decided yet,
+         * although we initialized the queue using blk_init_queue().
+         * The type is decided at the first table loading time.
+         * To prevent problematic device stacking, clear the queue flag
+         * for request stacking support until then.
+         *
+         * This queue is new, so no concurrency on the queue_flags.
+         */
+        queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+        md->saved_make_request_fn = md->queue->make_request_fn;
        md->queue->queuedata = md;
        md->queue->backing_dev_info.congested_fn = dm_any_congested;
        md->queue->backing_dev_info.congested_data = md;
        blk_queue_make_request(md->queue, dm_request);
-        blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
        md->queue->unplug_fn = dm_unplug_all;
        blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+        blk_queue_softirq_done(md->queue, dm_softirq_done);
-        md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
+        blk_queue_prep_rq(md->queue, dm_prep_fn);
-        if (!md->io_pool)
+        blk_queue_lld_busy(md->queue, dm_lld_busy);
-                goto bad_io_pool;
-        md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
-        if (!md->tio_pool)
-                goto bad_tio_pool;
-        md->bs = bioset_create(16, 0);
-        if (!md->bs)
-                goto bad_no_bioset;
        md->disk = alloc_disk(1);
        if (!md->disk)
@@ -1170,6 +1798,10 @@ static struct mapped_device *alloc_dev(int minor)
        if (!md->wq)
                goto bad_thread;
+        md->bdev = bdget_disk(md->disk, 0);
+        if (!md->bdev)
+                goto bad_bdev;
        /* Populate the mapping, nobody knows we exist yet */
        spin_lock(&_minor_lock);
        old_md = idr_replace(&_minor_idr, md, minor);
@@ -1179,15 +1811,11 @@ static struct mapped_device *alloc_dev(int minor)
        return md;
+bad_bdev:
+        destroy_workqueue(md->wq);
 bad_thread:
        put_disk(md->disk);
 bad_disk:
-        bioset_free(md->bs);
-bad_no_bioset:
-        mempool_destroy(md->tio_pool);
-bad_tio_pool:
-        mempool_destroy(md->io_pool);
-bad_io_pool:
        blk_cleanup_queue(md->queue);
 bad_queue:
        free_minor(minor);
@@ -1204,14 +1832,15 @@ static void free_dev(struct mapped_device *md)
 {
        int minor = MINOR(disk_devt(md->disk));
-        if (md->suspended_bdev) {
+        unlock_fs(md);
-                unlock_fs(md);
+        bdput(md->bdev);
-                bdput(md->suspended_bdev);
-        }
        destroy_workqueue(md->wq);
-        mempool_destroy(md->tio_pool);
+        if (md->tio_pool)
-        mempool_destroy(md->io_pool);
+                mempool_destroy(md->tio_pool);
-        bioset_free(md->bs);
+        if (md->io_pool)
+                mempool_destroy(md->io_pool);
+        if (md->bs)
+                bioset_free(md->bs);
        blk_integrity_unregister(md->disk);
        del_gendisk(md->disk);
        free_minor(minor);
@@ -1226,6 +1855,29 @@ static void free_dev(struct mapped_device *md)
        kfree(md);
 }
+static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
+{
+        struct dm_md_mempools *p;
+        if (md->io_pool && md->tio_pool && md->bs)
+                /* the md already has necessary mempools */
+                goto out;
+        p = dm_table_get_md_mempools(t);
+        BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); /* all-or-nothing: a partially populated md is a bug */
+        md->io_pool = p->io_pool; /* move ownership of each pool from the table to the md ... */
+        p->io_pool = NULL; /* ... and clear the table's pointer so only the md refers to it */
+        md->tio_pool = p->tio_pool;
+        p->tio_pool = NULL;
+        md->bs = p->bs;
+        p->bs = NULL;
+out:
+        /* mempool bind completed, now no need any mempools in the table */
+        dm_table_free_md_mempools(t);
+}
 /*
 * Bind a table to the device.
 */
@@ -1249,15 +1901,17 @@ static void __set_size(struct mapped_device *md, sector_t size)
 {
        set_capacity(md->disk, size);
-        mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
+        mutex_lock(&md->bdev->bd_inode->i_mutex);
-        i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
+        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-        mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
+        mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
-static int __bind(struct mapped_device *md, struct dm_table *t)
+static int __bind(struct mapped_device *md, struct dm_table *t,
+                  struct queue_limits *limits)
 {
        struct request_queue *q = md->queue;
        sector_t size;
+        unsigned long flags;
        size = dm_table_get_size(t);
@@ -1267,8 +1921,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
        if (size != get_capacity(md->disk))
                memset(&md->geometry, 0, sizeof(md->geometry));
-        if (md->suspended_bdev)
+        __set_size(md, size);
-                __set_size(md, size);
        if (!size) {
                dm_table_destroy(t);
@@ -1277,10 +1930,22 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
        dm_table_event_callback(t, event_callback, md);
-        write_lock(&md->map_lock);
+        /*
+         * The queue hasn't been stopped yet, if the old table type wasn't
+         * for request-based during suspension.  So stop it to prevent
+         * I/O mapping before resume.
+         * This must be done before setting the queue restrictions,
+         * because request-based dm may be run just after the setting.
+         */
+        if (dm_table_request_based(t) && !blk_queue_stopped(q))
+                stop_queue(q);
+        __bind_mempools(md, t);
+        write_lock_irqsave(&md->map_lock, flags);
        md->map = t;
-        dm_table_set_restrictions(t, q);
+        dm_table_set_restrictions(t, q, limits);
-        write_unlock(&md->map_lock);
+        write_unlock_irqrestore(&md->map_lock, flags);
        return 0;
 }
@@ -1288,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 static void __unbind(struct mapped_device *md)
 {
        struct dm_table *map = md->map;
+        unsigned long flags;
        if (!map)
                return;
        dm_table_event_callback(map, NULL, NULL);
-        write_lock(&md->map_lock);
+        write_lock_irqsave(&md->map_lock, flags);
        md->map = NULL;
-        write_unlock(&md->map_lock);
+        write_unlock_irqrestore(&md->map_lock, flags);
        dm_table_destroy(map);
 }
@@ -1399,6 +2065,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 {
        int r = 0;
        DECLARE_WAITQUEUE(wait, current);
+        struct request_queue *q = md->queue;
+        unsigned long flags;
        dm_unplug_all(md->queue);
@@ -1408,7 +2076,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
                set_current_state(interruptible);
                smp_mb();
-                if (!atomic_read(&md->pending))
+                if (dm_request_based(md)) {
+                        spin_lock_irqsave(q->queue_lock, flags);
+                        if (!queue_in_flight(q) && blk_queue_stopped(q)) {
+                                spin_unlock_irqrestore(q->queue_lock, flags);
+                                break;
+                        }
+                        spin_unlock_irqrestore(q->queue_lock, flags);
+                } else if (!atomic_read(&md->pending))
                        break;
                if (interruptible == TASK_INTERRUPTIBLE &&
@@ -1426,34 +2101,36 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
        return r;
 }
-static int dm_flush(struct mapped_device *md)
+static void dm_flush(struct mapped_device *md)
 {
        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-        return 0;
+        bio_init(&md->barrier_bio);
+        md->barrier_bio.bi_bdev = md->bdev;
+        md->barrier_bio.bi_rw = WRITE_BARRIER;
+        __split_and_process_bio(md, &md->barrier_bio);
+        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
 }
 static void process_barrier(struct mapped_device *md, struct bio *bio)
 {
-        int error = dm_flush(md);
+        md->barrier_error = 0;
-        if (unlikely(error)) {
-                bio_endio(bio, error);
-                return;
-        }
-        if (bio_empty_barrier(bio)) {
-                bio_endio(bio, 0);
-                return;
-        }
-        __split_and_process_bio(md, bio);
-        error = dm_flush(md);
+        dm_flush(md);
-        if (!error && md->barrier_error)
+        if (!bio_empty_barrier(bio)) {
-                error = md->barrier_error;
+                __split_and_process_bio(md, bio);
+                dm_flush(md);
+        }
        if (md->barrier_error != DM_ENDIO_REQUEUE)
-                bio_endio(bio, error);
+                bio_endio(bio, md->barrier_error);
+        else {
+                spin_lock_irq(&md->deferred_lock);
+                bio_list_add_head(&md->deferred, bio);
+                spin_unlock_irq(&md->deferred_lock);
+        }
 }
 /*
@@ -1479,10 +2156,14 @@ static void dm_wq_work(struct work_struct *work)
                up_write(&md->io_lock);
-                if (bio_barrier(c))
+                if (dm_request_based(md))
-                        process_barrier(md, c);
+                        generic_make_request(c);
-                else
+                else {
-                        __split_and_process_bio(md, c);
+                        if (bio_barrier(c))
+                                process_barrier(md, c);
+                        else
+                                __split_and_process_bio(md, c);
+                }
                down_write(&md->io_lock);
        }
@@ -1502,6 +2183,7 @@ static void dm_queue_flush(struct mapped_device *md)
 */
 int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 {
+        struct queue_limits limits;
        int r = -EINVAL;
        mutex_lock(&md->suspend_lock);
@@ -1510,19 +2192,96 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
        if (!dm_suspended(md))
                goto out;
-        /* without bdev, the device size cannot be changed */
+        r = dm_calculate_queue_limits(table, &limits);
-        if (!md->suspended_bdev)
+        if (r)
-                if (get_capacity(md->disk) != dm_table_get_size(table))
+                goto out;
-                        goto out;
+        /* cannot change the device type, once a table is bound */
+        if (md->map &&
+            (dm_table_get_type(md->map) != dm_table_get_type(table))) {
+                DMWARN("can't change the device type after a table is bound");
+                goto out;
+        }
+        /*
+         * It is enough that blk_queue_ordered() is called only once when
+         * the first bio-based table is bound.
+         *
+         * This setting should be moved to alloc_dev() when request-based dm
+         * supports barrier.
+         */
+        if (!md->map && dm_table_bio_based(table))
+                blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
        __unbind(md);
-        r = __bind(md, table);
+        r = __bind(md, table, &limits);
 out:
        mutex_unlock(&md->suspend_lock);
        return r;
 }
+static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
+{
+        md->suspend_rq.special = (void *)0x1;
+}
+static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
+{
+        struct request_queue *q = md->queue;
+        unsigned long flags;
+        spin_lock_irqsave(q->queue_lock, flags);
+        if (!noflush)
+                dm_rq_invalidate_suspend_marker(md);
+        __start_queue(q);
+        spin_unlock_irqrestore(q->queue_lock, flags);
+}
+static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
+{
+        struct request *rq = &md->suspend_rq;
+        struct request_queue *q = md->queue;
+        if (noflush)
+                stop_queue(q);
+        else {
+                blk_rq_init(q, rq);
+                blk_insert_request(q, rq, 0, NULL);
+        }
+}
+static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
+{
+        int r = 1;
+        struct request *rq = &md->suspend_rq;
+        struct request_queue *q = md->queue;
+        unsigned long flags;
+        if (noflush)
+                return r;
+        /* The marker must be protected by queue lock if it is in use */
+        spin_lock_irqsave(q->queue_lock, flags);
+        if (unlikely(rq->ref_count)) {
+                /*
+                 * This can happen, when the previous flush suspend was
+                 * interrupted, the marker is still in the queue and
+                 * this flush suspend has been invoked, because we don't
+                 * remove the marker at the time of suspend interruption.
+                 * We have only one marker per mapped_device, so we can't
+                 * start another flush suspend while it is in use.
+                 */
+                BUG_ON(!rq->special); /* The marker should be invalidated */
+                DMWARN("Invalidating the previous flush suspend is still in"
+                       " progress.  Please retry later.");
+                r = 0;
+        }
+        spin_unlock_irqrestore(q->queue_lock, flags);
+        return r;
+}
 /*
 * Functions to lock and unlock any filesystem running on the
 * device.
@@ -1533,7 +2292,7 @@ static int lock_fs(struct mapped_device *md)
        WARN_ON(md->frozen_sb);
-        md->frozen_sb = freeze_bdev(md->suspended_bdev);
+        md->frozen_sb = freeze_bdev(md->bdev);
        if (IS_ERR(md->frozen_sb)) {
                r = PTR_ERR(md->frozen_sb);
                md->frozen_sb = NULL;
@@ -1542,9 +2301,6 @@ static int lock_fs(struct mapped_device *md)
        set_bit(DMF_FROZEN, &md->flags);
-        /* don't bdput right now, we don't want the bdev
-         * to go away while it is locked.
-         */
        return 0;
 }
@@ -1553,7 +2309,7 @@ static void unlock_fs(struct mapped_device *md)
        if (!test_bit(DMF_FROZEN, &md->flags))
                return;
-        thaw_bdev(md->suspended_bdev, md->frozen_sb);
+        thaw_bdev(md->bdev, md->frozen_sb);
        md->frozen_sb = NULL;
        clear_bit(DMF_FROZEN, &md->flags);
 }
@@ -1565,6 +2321,53 @@ static void unlock_fs(struct mapped_device *md)
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
+/*
+ * Suspend mechanism in request-based dm.
+ *
+ * After the suspend starts, further incoming requests are kept in
+ * the request_queue and deferred.
+ * Remaining requests in the request_queue at the start of suspend are flushed
+ * if it is flush suspend.
+ * The suspend completes when the following conditions have been satisfied,
+ * so wait for it:
+ *    1. q->in_flight is 0 (which means no in_flight request)
+ *    2. queue has been stopped (which means no request dispatching)
+ *
+ *
+ * Noflush suspend
+ * ---------------
+ * Noflush suspend doesn't need to dispatch remaining requests.
+ * So stop the queue immediately.  Then, wait for all in_flight requests
+ * to be completed or requeued.
+ *
+ * To abort noflush suspend, start the queue.
+ *
+ *
+ * Flush suspend
+ * -------------
+ * Flush suspend needs to dispatch remaining requests.  So stop the queue
+ * after the remaining requests are completed. (Requeued requests must also be
+ * re-dispatched and completed.  Until then, we can't stop the queue.)
+ *
+ * During flushing the remaining requests, further incoming requests are also
+ * inserted to the same queue.  To distinguish which requests are to be
+ * flushed, we insert a marker request to the queue at the time of starting
+ * flush suspend, like a barrier.
+ * The dispatching is blocked when the marker is found on the top of the queue.
+ * And the queue is stopped when all in_flight requests are completed, since
+ * that means the remaining requests are completely flushed.
+ * Then, the marker is removed from the queue.
+ *
+ * To abort flush suspend, we also need to take care of the marker, not only
+ * starting the queue.
+ * We don't forcibly remove the marker from the queue since that goes against
+ * block-layer conventions.  Instead, we put an invalidated mark on the marker.
+ * When the invalidated marker is found on the top of the queue, it is
+ * immediately removed from the queue, so it doesn't block dispatching.
+ * Because we have only one marker per mapped_device, we can't start another
+ * flush suspend until the invalidated marker is removed from the queue.
+ * So fail and return with -EBUSY in such a case.
+ */
 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
        struct dm_table *map = NULL;
@@ -1579,6 +2382,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
                goto out_unlock;
        }
+        if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
+                r = -EBUSY;
+                goto out_unlock;
+        }
        map = dm_get_table(md);
        /*
@@ -1591,24 +2399,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        /* This does not get reverted if there's an error later. */
        dm_table_presuspend_targets(map);
-        /* bdget() can stall if the pending I/Os are not flushed */
+        /*
-        if (!noflush) {
+         * Flush I/O to the device. noflush supersedes do_lockfs,
-                md->suspended_bdev = bdget_disk(md->disk, 0);
+         * because lock_fs() needs to flush I/Os.
-                if (!md->suspended_bdev) {
+         */
-                        DMWARN("bdget failed in dm_suspend");
+        if (!noflush && do_lockfs) {
-                        r = -ENOMEM;
+                r = lock_fs(md);
+                if (r)
                        goto out;
-                }
-                /*
-                 * Flush I/O to the device. noflush supersedes do_lockfs,
-                 * because lock_fs() needs to flush I/Os.
-                 */
-                if (do_lockfs) {
-                        r = lock_fs(md);
-                        if (r)
-                                goto out;
-                }
        }
        /*
@@ -1634,6 +2432,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        flush_workqueue(md->wq);
+        if (dm_request_based(md))
+                dm_rq_start_suspend(md, noflush);
        /*
         * At this point no more requests are entering target request routines.
         * We call dm_wait_for_completion to wait for all existing requests
@@ -1650,6 +2451,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        if (r < 0) {
                dm_queue_flush(md);
+                if (dm_request_based(md))
+                        dm_rq_abort_suspend(md, noflush);
                unlock_fs(md);
                goto out; /* pushback list is already flushed, so skip flush */
        }
@@ -1665,11 +2469,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        set_bit(DMF_SUSPENDED, &md->flags);
 out:
-        if (r && md->suspended_bdev) {
-                bdput(md->suspended_bdev);
-                md->suspended_bdev = NULL;
-        }
        dm_table_put(map);
 out_unlock:
@@ -1696,21 +2495,20 @@ int dm_resume(struct mapped_device *md)
        dm_queue_flush(md);
-        unlock_fs(md);
+        /*
+         * Flushing deferred I/Os must be done after targets are resumed
+         * so that mapping of targets can work correctly.
+         * Request-based dm is queueing the deferred I/Os in its request_queue.
+         */
+        if (dm_request_based(md))
+                start_queue(md->queue);
-        if (md->suspended_bdev) {
+        unlock_fs(md);
-                bdput(md->suspended_bdev);
-                md->suspended_bdev = NULL;
-        }
        clear_bit(DMF_SUSPENDED, &md->flags);
        dm_table_unplug_all(map);
-        dm_kobject_uevent(md);
        r = 0;
 out:
        dm_table_put(map);
        mutex_unlock(&md->suspend_lock);
@@ -1721,9 +2519,19 @@ out:
 /*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
-void dm_kobject_uevent(struct mapped_device *md)
+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
-{
+                       unsigned cookie)
-        kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
+{
+        char udev_cookie[DM_COOKIE_LENGTH];
+        char *envp[] = { udev_cookie, NULL };
+        if (!cookie)
+                kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
+        else {
+                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
+                         DM_COOKIE_ENV_VAR_NAME, cookie);
+                kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
+        }
 }
 uint32_t dm_next_uevent_seq(struct mapped_device *md)
@@ -1777,6 +2585,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
        if (&md->kobj != kobj)
                return NULL;
+        if (test_bit(DMF_FREEING, &md->flags) ||
+            test_bit(DMF_DELETING, &md->flags))
+                return NULL;
        dm_get(md);
        return md;
 }
@@ -1797,6 +2609,61 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
+{
+        struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
+        if (!pools)
+                return NULL;
+        pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
+                         mempool_create_slab_pool(MIN_IOS, _io_cache) :
+                         mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
+        if (!pools->io_pool)
+                goto free_pools_and_out;
+        pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
+                          mempool_create_slab_pool(MIN_IOS, _tio_cache) :
+                          mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
+        if (!pools->tio_pool)
+                goto free_io_pool_and_out;
+        pools->bs = (type == DM_TYPE_BIO_BASED) ?
+                    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
+        if (!pools->bs)
+                goto free_tio_pool_and_out;
+        return pools;
+free_tio_pool_and_out:
+        mempool_destroy(pools->tio_pool);
+free_io_pool_and_out:
+        mempool_destroy(pools->io_pool);
+free_pools_and_out:
+        kfree(pools);
+        return NULL;
+}
+void dm_free_md_mempools(struct dm_md_mempools *pools)
+{
+        if (!pools)
+                return;
+        if (pools->io_pool)
+                mempool_destroy(pools->io_pool);
+        if (pools->tio_pool)
+                mempool_destroy(pools->tio_pool);
+        if (pools->bs)
+                bioset_free(pools->bs);
+        kfree(pools);
+}
 static struct block_device_operations dm_blk_dops = {
        .open = dm_blk_open,
        .release = dm_blk_close,
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a31506d93e91..23278ae80f08 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -23,6 +23,13 @@
 #define DM_SUSPEND_NOFLUSH_FLAG         (1 << 1)
 /*
+ * Type of table and mapped_device's mempool
+ */
+#define DM_TYPE_NONE            0
+#define DM_TYPE_BIO_BASED       1
+#define DM_TYPE_REQUEST_BASED   2
+/*
 * List of devices that a metadevice uses and should open/close.
 */
 struct dm_dev_internal {
@@ -32,6 +39,7 @@ struct dm_dev_internal {
 };
 struct dm_table;
+struct dm_md_mempools;
 /*-----------------------------------------------------------------
 * Internal table functions.
@@ -41,18 +49,34 @@ void dm_table_event_callback(struct dm_table *t,
                             void (*fn)(void *), void *context);
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
+int dm_calculate_queue_limits(struct dm_table *table,
+                              struct queue_limits *limits);
+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+                               struct queue_limits *limits);
 struct list_head *dm_table_get_devices(struct dm_table *t);
 void dm_table_presuspend_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+int dm_table_any_busy_target(struct dm_table *t);
+int dm_table_set_type(struct dm_table *t);
+unsigned dm_table_get_type(struct dm_table *t);
+bool dm_table_bio_based(struct dm_table *t);
+bool dm_table_request_based(struct dm_table *t);
+int dm_table_alloc_md_mempools(struct dm_table *t);
+void dm_table_free_md_mempools(struct dm_table *t);
+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 /*
 * To check the return value from dm_table_find_target().
 */
 #define dm_target_is_valid(t) ((t)->table)
+/*
+ * To check whether the target type is request-based or not (bio-based).
+ */
+#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
 /*-----------------------------------------------------------------
 * A registry of target types.
 *---------------------------------------------------------------*/
@@ -92,9 +116,16 @@ void dm_stripe_exit(void);
 int dm_open_count(struct mapped_device *md);
 int dm_lock_for_deletion(struct mapped_device *md);
-void dm_kobject_uevent(struct mapped_device *md);
+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
+                       unsigned cookie);
 int dm_kcopyd_init(void);
 void dm_kcopyd_exit(void);
+/*
+ * Mempool operations
+ */
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
+void dm_free_md_mempools(struct dm_md_mempools *pools);
 #endif
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 03f22076381f..334a3593cdfd 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -57,6 +57,7 @@ header-y += dlmconstants.h
 header-y += dlm_device.h
 header-y += dlm_netlink.h
 header-y += dm-ioctl.h
+header-y += dm-log-userspace.h
 header-y += dn.h
 header-y += dqblk_xfs.h
 header-y += efs_fs_sb.h
diff --git a/include/linux/connector.h b/include/linux/connector.h
index b9966e64604e..b68d27850d51 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -41,8 +41,10 @@
 #define CN_IDX_BB                       0x5     /* BlackBoard, from the TSP GPL sampling framework */
 #define CN_DST_IDX                      0x6
 #define CN_DST_VAL                      0x1
+#define CN_IDX_DM                       0x7     /* Device Mapper */
+#define CN_VAL_DM_USERSPACE_LOG         0x1
-#define CN_NETLINK_USERS                7
+#define CN_NETLINK_USERS                8
 /*
 * Maximum connector's message size.
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 49c2362977fd..0d6310657f32 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -11,6 +11,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+struct dm_dev;
 struct dm_target;
 struct dm_table;
 struct mapped_device;
@@ -21,6 +22,7 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 union map_info {
        void *ptr;
        unsigned long long ll;
+        unsigned flush_request;
 };
 /*
@@ -80,6 +82,15 @@ typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd,
 typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
                            struct bio_vec *biovec, int max_size);
+typedef int (*iterate_devices_callout_fn) (struct dm_target *ti,
+                                           struct dm_dev *dev,
+                                           sector_t physical_start,
+                                           void *data);
+typedef int (*dm_iterate_devices_fn) (struct dm_target *ti,
+                                      iterate_devices_callout_fn fn,
+                                      void *data);
 /*
 * Returns:
 *    0: The target can handle the next I/O immediately.
@@ -92,7 +103,8 @@ void dm_error(const char *message);
 /*
 * Combine device limits.
 */
-void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev);
+int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
+                         sector_t start, void *data);
 struct dm_dev {
        struct block_device *bdev;
@@ -138,23 +150,12 @@ struct target_type {
        dm_ioctl_fn ioctl;
        dm_merge_fn merge;
        dm_busy_fn busy;
+        dm_iterate_devices_fn iterate_devices;
        /* For internal device-mapper use. */
        struct list_head list;
 };
-struct io_restrictions {
-        unsigned long bounce_pfn;
-        unsigned long seg_boundary_mask;
-        unsigned max_hw_sectors;
-        unsigned max_sectors;
-        unsigned max_segment_size;
-        unsigned short logical_block_size;
-        unsigned short max_hw_segments;
-        unsigned short max_phys_segments;
-        unsigned char no_cluster; /* inverted so that 0 is default */
-};
 struct dm_target {
        struct dm_table *table;
        struct target_type *type;
@@ -163,15 +164,18 @@ struct dm_target {
        sector_t begin;
        sector_t len;
-        /* FIXME: turn this into a mask, and merge with io_restrictions */
        /* Always a power of 2 */
        sector_t split_io;
        /*
-         * These are automatically filled in by
+         * A number of zero-length barrier requests that will be submitted
-         * dm_table_get_device.
+         * to the target for the purpose of flushing cache.
+         *
+         * The request number will be placed in union map_info->flush_request.
+         * It is a responsibility of the target driver to remap these requests
+         * to the real underlying devices.
         */
-        struct io_restrictions limits;
+        unsigned num_flush_requests;
        /* target specific data */
        void *private;
@@ -230,6 +234,7 @@ struct gendisk *dm_disk(struct mapped_device *md);
 int dm_suspended(struct mapped_device *md);
 int dm_noflush_suspending(struct dm_target *ti);
 union map_info *dm_get_mapinfo(struct bio *bio);
+union map_info *dm_get_rq_mapinfo(struct request *rq);
 /*
 * Geometry functions.
@@ -392,4 +397,12 @@ static inline unsigned long to_bytes(sector_t n)
        return (n << SECTOR_SHIFT);
 }
+/*-----------------------------------------------------------------
+ * Helper for block layer and dm core operations
+ *---------------------------------------------------------------*/
+void dm_dispatch_request(struct request *rq);
+void dm_requeue_unmapped_request(struct request *rq);
+void dm_kill_unmapped_request(struct request *rq, int error);
+int dm_underlying_device_busy(struct request_queue *q);
 #endif  /* _LINUX_DEVICE_MAPPER_H */
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 48e44ee2b466..2ab84c83c31a 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -123,6 +123,16 @@ struct dm_ioctl {
        __u32 target_count;     /* in/out */
        __s32 open_count;       /* out */
        __u32 flags;            /* in/out */
+        /*
+         * event_nr holds either the event number (input and output) or the
+         * udev cookie value (input only).
+         * The DM_DEV_WAIT ioctl takes an event number as input.
+         * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls
+         * use the field as a cookie to return in the DM_COOKIE
+         * variable with the uevents they issue.
+         * For output, the ioctls return the event number, not the cookie.
+         */
        __u32 event_nr;         /* in/out */
        __u32 padding;
@@ -256,9 +266,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY     _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 #define DM_VERSION_MAJOR        4
-#define DM_VERSION_MINOR        14
+#define DM_VERSION_MINOR        15
 #define DM_VERSION_PATCHLEVEL   0
-#define DM_VERSION_EXTRA        "-ioctl (2008-04-23)"
+#define DM_VERSION_EXTRA        "-ioctl (2009-04-01)"
 /* Status bits */
 #define DM_READONLY_FLAG        (1 << 0) /* In/Out */
diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h
new file mode 100644
index 000000000000..642e3017b51f
--- /dev/null
+++ b/include/linux/dm-log-userspace.h
@@ -0,0 +1,386 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+#ifndef __DM_LOG_USERSPACE_H__
+#define __DM_LOG_USERSPACE_H__
+#include <linux/dm-ioctl.h> /* For DM_UUID_LEN */
+/*
+ * The device-mapper userspace log module consists of a kernel component and
+ * a user-space component.  The kernel component implements the API defined
+ * in dm-dirty-log.h.  Its purpose is simply to pass the parameters and
+ * return values of those API functions between kernel and user-space.
+ *
+ * Below are defined the 'request_types' - DM_ULOG_CTR, DM_ULOG_DTR, etc.
+ * These request types represent the different functions in the device-mapper
+ * dirty log API.  Each of these is described in more detail below.
+ *
+ * The user-space program must listen for requests from the kernel (representing
+ * the various API functions) and process them.
+ *
+ * User-space begins by setting up the communication link (error checking
+ * removed for clarity):
+ *      fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+ *      addr.nl_family = AF_NETLINK;
+ *      addr.nl_groups = CN_IDX_DM;
+ *      addr.nl_pid = 0;
+ *      r = bind(fd, (struct sockaddr *) &addr, sizeof(addr));
+ *      opt = addr.nl_groups;
+ *      setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &opt, sizeof(opt));
+ *
+ * User-space will then wait to receive requests from the kernel, which it
+ * will process as described below.  The requests are received in the form,
+ * ((struct dm_ulog_request) + (additional data)).  Depending on the request
+ * type, there may or may not be 'additional data'.  In the descriptions below,
+ * you will see 'Payload-to-userspace' and 'Payload-to-kernel'.  The
+ * 'Payload-to-userspace' is what the kernel sends in 'additional data' as
+ * necessary parameters to complete the request.  The 'Payload-to-kernel' is
+ * the 'additional data' returned to the kernel that contains the necessary
+ * results of the request.  The 'data_size' field in the dm_ulog_request
+ * structure denotes the availability and amount of payload data.
+ */
+/*
+ * DM_ULOG_CTR corresponds to (found in dm-dirty-log.h):
+ * int (*ctr)(struct dm_dirty_log *log, struct dm_target *ti,
+ *            unsigned argc, char **argv);
+ *
+ * Payload-to-userspace:
+ *      A single string containing all the argv arguments separated by ' 's
+ * Payload-to-kernel:
+ *      None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * The UUID contained in the dm_ulog_request structure is the reference that
+ * will be used by all request types to a specific log.  The constructor must
+ * record this association with the instance created.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_CTR                    1
+/*
+ * DM_ULOG_DTR corresponds to (found in dm-dirty-log.h):
+ * void (*dtr)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *      A single string containing all the argv arguments separated by ' 's
+ * Payload-to-kernel:
+ *      None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being destroyed.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_DTR                    2
+/*
+ * DM_ULOG_PRESUSPEND corresponds to (found in dm-dirty-log.h):
+ * int (*presuspend)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *      None.
+ * Payload-to-kernel:
+ *      None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being presuspended.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_PRESUSPEND             3
+/*
+ * DM_ULOG_POSTSUSPEND corresponds to (found in dm-dirty-log.h):
+ * int (*postsuspend)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *      None.
+ * Payload-to-kernel:
+ *      None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being postsuspended.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_POSTSUSPEND            4
+/*
+ * DM_ULOG_RESUME corresponds to (found in dm-dirty-log.h):
+ * int (*resume)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *      None.
+ * Payload-to-kernel:
+ *      None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being resumed.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_RESUME                 5
+/*
+ * DM_ULOG_GET_REGION_SIZE corresponds to (found in dm-dirty-log.h):
+ * uint32_t (*get_region_size)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *      None.
+ * Payload-to-kernel:
+ *      uint64_t - contains the region size
+ *
+ * The region size is something that was determined at constructor time.
+ * It is returned in the payload area and 'data_size' is set to
+ * reflect this.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field appropriately.
+ */
+#define DM_ULOG_GET_REGION_SIZE        6
+/*
+ * DM_ULOG_IS_CLEAN corresponds to (found in dm-dirty-log.h):
+ * int (*is_clean)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *      uint64_t - the region to get clean status on
+ * Payload-to-kernel:
+ *      int64_t  - 1 if clean, 0 otherwise
+ *
+ * Payload is sizeof(uint64_t) and contains the region for which the clean
+ * status is being requested.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - filling the payload with 0 (not clean) or
+ * 1 (clean), setting 'data_size' and 'error' appropriately.
+ */
+#define DM_ULOG_IS_CLEAN               7
+/*
+ * DM_ULOG_IN_SYNC corresponds to (found in dm-dirty-log.h):
+ * int (*in_sync)(struct dm_dirty_log *log, region_t region,
+ *                int can_block);
+ *
+ * Payload-to-userspace:
+ *      uint64_t - the region to get sync status on
+ * Payload-to-kernel:
+ *      int64_t - 1 if in-sync, 0 otherwise
+ *
+ * Exactly the same as 'is_clean' above, except this time asking "has the
+ * region been recovered?" vs. "is the region not being modified?"
+ */
+#define DM_ULOG_IN_SYNC                8
+/*
+ * DM_ULOG_FLUSH corresponds to (found in dm-dirty-log.h):
+ * int (*flush)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *      None.
+ * Payload-to-kernel:
+ *      None.
+ *
+ * No incoming or outgoing payload.  Simply flush log state to disk.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_FLUSH                  9
+/*
+ * DM_ULOG_MARK_REGION corresponds to (found in dm-dirty-log.h):
+ * void (*mark_region)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *      uint64_t [] - region(s) to mark
+ * Payload-to-kernel:
+ *      None.
+ *
+ * Incoming payload contains the one or more regions to mark dirty.
+ * The number of regions contained in the payload can be determined from
+ * 'data_size/sizeof(uint64_t)'.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_MARK_REGION           10
+/*
+ * DM_ULOG_CLEAR_REGION corresponds to (found in dm-dirty-log.h):
+ * void (*clear_region)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *      uint64_t [] - region(s) to clear
+ * Payload-to-kernel:
+ *      None.
+ *
+ * Incoming payload contains the one or more regions to mark clean.
+ * The number of regions contained in the payload can be determined from
+ * 'data_size/sizeof(uint64_t)'.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_CLEAR_REGION          11
+/*
+ * DM_ULOG_GET_RESYNC_WORK corresponds to (found in dm-dirty-log.h):
+ * int (*get_resync_work)(struct dm_dirty_log *log, region_t *region);
+ *
+ * Payload-to-userspace:
+ *      None.
+ * Payload-to-kernel:
+ *      {
+ *              int64_t i; -- 1 if recovery necessary, 0 otherwise
+ *              uint64_t r; -- The region to recover if i == 1
+ *      }
+ * 'data_size' should be set to the size of the returned payload.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field appropriately.
+ */
+#define DM_ULOG_GET_RESYNC_WORK       12
+/*
+ * DM_ULOG_SET_REGION_SYNC corresponds to (found in dm-dirty-log.h):
+ * void (*set_region_sync)(struct dm_dirty_log *log,
+ *                         region_t region, int in_sync);
+ *
+ * Payload-to-userspace:
+ *      {
+ *              uint64_t - 'region': region to set sync state on
+ *              int64_t  - 'in_sync': 0 if not-in-sync, 1 if in-sync
+ *      }
+ * Payload-to-kernel:
+ *      None.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field appropriately
+ * and clearing 'data_size' (no payload is returned to the kernel).
+ */
+#define DM_ULOG_SET_REGION_SYNC       13
+/*
+ * DM_ULOG_GET_SYNC_COUNT corresponds to (found in dm-dirty-log.h):
+ * region_t (*get_sync_count)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *      None.
+ * Payload-to-kernel:
+ *      uint64_t - the number of in-sync regions
+ *
+ * No incoming payload.  Kernel-bound payload contains the number of
+ * regions that are in-sync (in a uint64_t).
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_GET_SYNC_COUNT        14
+/*
+ * DM_ULOG_STATUS_INFO corresponds to (found in dm-dirty-log.h):
+ * int (*status)(struct dm_dirty_log *log, STATUSTYPE_INFO,
+ *               char *result, unsigned maxlen);
+ *
+ * Payload-to-userspace:
+ *      None.
+ * Payload-to-kernel:
+ *      Character string holding the STATUSTYPE_INFO status text
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_STATUS_INFO           15
+/*
+ * DM_ULOG_STATUS_TABLE corresponds to (found in dm-dirty-log.h):
+ * int (*status)(struct dm_dirty_log *log, STATUSTYPE_TABLE,
+ *               char *result, unsigned maxlen);
+ *
+ * Payload-to-userspace:
+ *      None.
+ * Payload-to-kernel:
+ *      Character string holding the STATUSTYPE_TABLE status text
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_STATUS_TABLE          16
+/*
+ * DM_ULOG_IS_REMOTE_RECOVERING corresponds to (found in dm-dirty-log.h):
+ * int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *      uint64_t - region whose recovery status is being queried
+ * Payload-to-kernel:
+ *      {
+ *              int64_t is_recovering;  -- 0 if no, 1 if yes
+ *              uint64_t in_sync_hint;  -- lowest region still needing resync
+ *      }
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_IS_REMOTE_RECOVERING  17
+/*
+ * (DM_ULOG_REQUEST_MASK & request_type) to get the request type
+ * NOTE(review): a mask has no payload; the entries below look misplaced.
+ * Payload-to-userspace:
+ *      A single string containing all the argv arguments separated by ' 's
+ * Payload-to-kernel:
+ *      None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * We are reserving 8 bits of the 32-bit 'request_type' field for the
+ * various request types above.  The remaining 24 bits are currently
+ * set to zero and are reserved for future use and compatibility concerns.
+ *
+ * User-space should always use DM_ULOG_REQUEST_TYPE to acquire the
+ * request type from the 'request_type' field to maintain forward compatibility.
+ */
+#define DM_ULOG_REQUEST_MASK 0xFF
+#define DM_ULOG_REQUEST_TYPE(request_type) \
+        (DM_ULOG_REQUEST_MASK & (request_type))
+struct dm_ulog_request { /* Header exchanged for every DM_ULOG_* request */
+        char uuid[DM_UUID_LEN]; /* Ties a request to a specific mirror log */
+        char padding[7];        /* Padding because DM_UUID_LEN = 129 (129 + 7 = 136) */
+        int32_t error;          /* Used to report back processing errors */
+        uint32_t seq;           /* Sequence number for request */
+        uint32_t request_type;  /* DM_ULOG_* above; read via DM_ULOG_REQUEST_TYPE() */
+        uint32_t data_size;     /* How much data (not including this struct) */
+        char data[];            /* Payload: 'data_size' bytes following the header */
+};
+#endif /* __DM_LOG_USERSPACE_H__ */