path: root/drivers/md
author    Linus Torvalds <torvalds@linux-foundation.org>  2015-04-18 08:14:18 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-04-18 08:14:18 -0400
commit    afad97eee47c1f1f242202e2473929b4ef5d9f43 (patch)
tree      31f68d70760234b582a28bd3f64311ff5307b7b1 /drivers/md
parent    04b7fe6a4a231871ef681bc95e08fe66992f7b1f (diff)
parent    44c144f9c8e8fbd73ede2848da8253b3aae42ec2 (diff)
Merge tag 'dm-4.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:

 - the most extensive changes this cycle are the DM core improvements to
   add full blk-mq support to request-based DM.

     - disabled by default but user can opt-in with CONFIG_DM_MQ_DEFAULT

     - depends on some blk-mq changes from Jens' for-4.1/core branch so
       that explains why this pull is built on linux-block.git

 - update DM to use name_to_dev_t() rather than open-coding a less
   capable device parser.

     - includes a couple small improvements to name_to_dev_t() that offer
       stricter constraints than DM's code provided.

 - improvements to the dm-cache "mq" cache replacement policy.

 - a DM crypt crypt_ctr() error path fix and an async crypto deadlock fix

 - a small efficiency improvement for DM crypt decryption by leveraging
   immutable biovecs

 - add error handling modes for corrupted blocks to DM verity

 - a new "log-writes" DM target from Josef Bacik that is meant for file
   system developers to test file system integrity at particular points
   in the life of a file system

 - a few DM log userspace cleanups and fixes

 - a few Documentation fixes (for thin, cache, crypt and switch)

* tag 'dm-4.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (34 commits)
  dm crypt: fix missing error code return from crypt_ctr error path
  dm crypt: fix deadlock when async crypto algorithm returns -EBUSY
  dm crypt: leverage immutable biovecs when decrypting on read
  dm crypt: update URLs to new cryptsetup project page
  dm: add log writes target
  dm table: use bool function return values of true/false not 1/0
  dm verity: add error handling modes for corrupted blocks
  dm thin: remove stale 'trim' message documentation
  dm delay: use msecs_to_jiffies for time conversion
  dm log userspace base: fix compile warning
  dm log userspace transfer: match wait_for_completion_timeout return type
  dm table: fall back to getting device using name_to_dev_t()
  init: stricter checking of major:minor root= values
  init: export name_to_dev_t and mark name argument as const
  dm: add 'use_blk_mq' module param and expose in per-device ro sysfs attr
  dm: optimize dm_mq_queue_rq to _not_ use kthread if using pure blk-mq
  dm: add full blk-mq support to request-based DM
  dm: impose configurable deadline for dm_request_fn's merge heuristic
  dm sysfs: introduce ability to add writable attributes
  dm: don't start current request if it would've merged with the previous
  ...
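The CONFIG_DM_MQ_DEFAULT behaviour described above (it only flips the default of
dm_mod.use_blk_mq, which can still be overridden either way) is the usual
Kconfig-driven module-parameter pattern. The dm.c hunks are not part of this
drivers/md excerpt, so the snippet below is only a minimal sketch of that pattern
under stated assumptions, not the merged code; the helper name
dm_use_blk_mq_default() is illustrative.

    /*
     * Minimal sketch (not the merged dm.c): a boolean module parameter whose
     * default tracks a Kconfig option but stays writable at boot/module-load
     * time. Requires <linux/module.h>; IS_ENABLED() is from <linux/kconfig.h>.
     */
    static bool use_blk_mq = IS_ENABLED(CONFIG_DM_MQ_DEFAULT);

    /* Illustrative helper: request-based DM would consult this when deciding
     * whether a new device uses the blk-mq path or the legacy request path. */
    bool dm_use_blk_mq_default(void)
    {
            return use_blk_mq;
    }

    /* Settable as dm_mod.use_blk_mq=Y|N on the kernel command line, or via
     * /sys/module/dm_mod/parameters/use_blk_mq after the module is loaded. */
    module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
    MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");

Per the shortlog entry "dm: add 'use_blk_mq' module param and expose in per-device
ro sysfs attr", the value a given device ends up using is also exported as a
read-only per-device sysfs attribute.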
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                      |  27
-rw-r--r--  drivers/md/Makefile                     |   1
-rw-r--r--  drivers/md/dm-cache-policy-mq.c         | 251
-rw-r--r--  drivers/md/dm-crypt.c                   |  25
-rw-r--r--  drivers/md/dm-delay.c                   |   2
-rw-r--r--  drivers/md/dm-log-userspace-base.c      |  91
-rw-r--r--  drivers/md/dm-log-userspace-transfer.c  |   5
-rw-r--r--  drivers/md/dm-log-writes.c              | 825
-rw-r--r--  drivers/md/dm-mpath.c                   |   6
-rw-r--r--  drivers/md/dm-sysfs.c                   |  43
-rw-r--r--  drivers/md/dm-table.c                   |  71
-rw-r--r--  drivers/md/dm-verity.c                  | 147
-rw-r--r--  drivers/md/dm.c                         | 556
-rw-r--r--  drivers/md/dm.h                         |  10
14 files changed, 1736 insertions, 324 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 63e05e32b462..6ddc983417d5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -196,6 +196,17 @@ config BLK_DEV_DM
196 196
197 If unsure, say N. 197 If unsure, say N.
198 198
199config DM_MQ_DEFAULT
200 bool "request-based DM: use blk-mq I/O path by default"
201 depends on BLK_DEV_DM
202 ---help---
203 This option enables the blk-mq based I/O path for request-based
204 DM devices by default. With the option the dm_mod.use_blk_mq
205 module/boot option defaults to Y, without it to N, but it can
206 still be overriden either way.
207
208 If unsure say N.
209
199config DM_DEBUG 210config DM_DEBUG
200 bool "Device mapper debugging support" 211 bool "Device mapper debugging support"
201 depends on BLK_DEV_DM 212 depends on BLK_DEV_DM
@@ -432,4 +443,20 @@ config DM_SWITCH
432 443
433 If unsure, say N. 444 If unsure, say N.
434 445
446config DM_LOG_WRITES
447 tristate "Log writes target support"
448 depends on BLK_DEV_DM
449 ---help---
450 This device-mapper target takes two devices, one device to use
451 normally, one to log all write operations done to the first device.
452 This is for use by file system developers wishing to verify that
453 their fs is writing a consitent file system at all times by allowing
454 them to replay the log in a variety of ways and to check the
455 contents.
456
457 To compile this code as a module, choose M here: the module will
458 be called dm-log-writes.
459
460 If unsure, say N.
461
435endif # MD 462endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a2da532b1c2b..1863feaa5846 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o
55obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o 55obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o
56obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o 56obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
57obj-$(CONFIG_DM_ERA) += dm-era.o 57obj-$(CONFIG_DM_ERA) += dm-era.o
58obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
58 59
59ifeq ($(CONFIG_DM_UEVENT),y) 60ifeq ($(CONFIG_DM_UEVENT),y)
60dm-mod-objs += dm-uevent.o 61dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 13f547a4eeb6..3ddd1162334d 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -8,6 +8,7 @@
8#include "dm.h" 8#include "dm.h"
9 9
10#include <linux/hash.h> 10#include <linux/hash.h>
11#include <linux/jiffies.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/mutex.h> 13#include <linux/mutex.h>
13#include <linux/slab.h> 14#include <linux/slab.h>
@@ -124,32 +125,41 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
124 * sorted queue. 125 * sorted queue.
125 */ 126 */
126#define NR_QUEUE_LEVELS 16u 127#define NR_QUEUE_LEVELS 16u
128#define NR_SENTINELS NR_QUEUE_LEVELS * 3
129
130#define WRITEBACK_PERIOD HZ
127 131
128struct queue { 132struct queue {
133 unsigned nr_elts;
134 bool current_writeback_sentinels;
135 unsigned long next_writeback;
129 struct list_head qs[NR_QUEUE_LEVELS]; 136 struct list_head qs[NR_QUEUE_LEVELS];
137 struct list_head sentinels[NR_SENTINELS];
130}; 138};
131 139
132static void queue_init(struct queue *q) 140static void queue_init(struct queue *q)
133{ 141{
134 unsigned i; 142 unsigned i;
135 143
136 for (i = 0; i < NR_QUEUE_LEVELS; i++) 144 q->nr_elts = 0;
145 q->current_writeback_sentinels = false;
146 q->next_writeback = 0;
147 for (i = 0; i < NR_QUEUE_LEVELS; i++) {
137 INIT_LIST_HEAD(q->qs + i); 148 INIT_LIST_HEAD(q->qs + i);
149 INIT_LIST_HEAD(q->sentinels + i);
150 INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
151 INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
152 }
138} 153}
139 154
140/* 155static unsigned queue_size(struct queue *q)
141 * Checks to see if the queue is empty.
142 * FIXME: reduce cpu usage.
143 */
144static bool queue_empty(struct queue *q)
145{ 156{
146 unsigned i; 157 return q->nr_elts;
147 158}
148 for (i = 0; i < NR_QUEUE_LEVELS; i++)
149 if (!list_empty(q->qs + i))
150 return false;
151 159
152 return true; 160static bool queue_empty(struct queue *q)
161{
162 return q->nr_elts == 0;
153} 163}
154 164
155/* 165/*
@@ -157,24 +167,19 @@ static bool queue_empty(struct queue *q)
157 */ 167 */
158static void queue_push(struct queue *q, unsigned level, struct list_head *elt) 168static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
159{ 169{
170 q->nr_elts++;
160 list_add_tail(elt, q->qs + level); 171 list_add_tail(elt, q->qs + level);
161} 172}
162 173
163static void queue_remove(struct list_head *elt) 174static void queue_remove(struct queue *q, struct list_head *elt)
164{ 175{
176 q->nr_elts--;
165 list_del(elt); 177 list_del(elt);
166} 178}
167 179
168/* 180static bool is_sentinel(struct queue *q, struct list_head *h)
169 * Shifts all regions down one level. This has no effect on the order of
170 * the queue.
171 */
172static void queue_shift_down(struct queue *q)
173{ 181{
174 unsigned level; 182 return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
175
176 for (level = 1; level < NR_QUEUE_LEVELS; level++)
177 list_splice_init(q->qs + level, q->qs + level - 1);
178} 183}
179 184
180/* 185/*
@@ -184,10 +189,12 @@ static void queue_shift_down(struct queue *q)
184static struct list_head *queue_peek(struct queue *q) 189static struct list_head *queue_peek(struct queue *q)
185{ 190{
186 unsigned level; 191 unsigned level;
192 struct list_head *h;
187 193
188 for (level = 0; level < NR_QUEUE_LEVELS; level++) 194 for (level = 0; level < NR_QUEUE_LEVELS; level++)
189 if (!list_empty(q->qs + level)) 195 list_for_each(h, q->qs + level)
190 return q->qs[level].next; 196 if (!is_sentinel(q, h))
197 return h;
191 198
192 return NULL; 199 return NULL;
193} 200}
@@ -197,16 +204,34 @@ static struct list_head *queue_pop(struct queue *q)
197 struct list_head *r = queue_peek(q); 204 struct list_head *r = queue_peek(q);
198 205
199 if (r) { 206 if (r) {
207 q->nr_elts--;
200 list_del(r); 208 list_del(r);
201
202 /* have we just emptied the bottom level? */
203 if (list_empty(q->qs))
204 queue_shift_down(q);
205 } 209 }
206 210
207 return r; 211 return r;
208} 212}
209 213
214/*
215 * Pops an entry from a level that is not past a sentinel.
216 */
217static struct list_head *queue_pop_old(struct queue *q)
218{
219 unsigned level;
220 struct list_head *h;
221
222 for (level = 0; level < NR_QUEUE_LEVELS; level++)
223 list_for_each(h, q->qs + level) {
224 if (is_sentinel(q, h))
225 break;
226
227 q->nr_elts--;
228 list_del(h);
229 return h;
230 }
231
232 return NULL;
233}
234
210static struct list_head *list_pop(struct list_head *lh) 235static struct list_head *list_pop(struct list_head *lh)
211{ 236{
212 struct list_head *r = lh->next; 237 struct list_head *r = lh->next;
@@ -217,6 +242,62 @@ static struct list_head *list_pop(struct list_head *lh)
217 return r; 242 return r;
218} 243}
219 244
245static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
246{
247 if (q->current_writeback_sentinels)
248 return q->sentinels + NR_QUEUE_LEVELS + level;
249 else
250 return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
251}
252
253static void queue_update_writeback_sentinels(struct queue *q)
254{
255 unsigned i;
256 struct list_head *h;
257
258 if (time_after(jiffies, q->next_writeback)) {
259 for (i = 0; i < NR_QUEUE_LEVELS; i++) {
260 h = writeback_sentinel(q, i);
261 list_del(h);
262 list_add_tail(h, q->qs + i);
263 }
264
265 q->next_writeback = jiffies + WRITEBACK_PERIOD;
266 q->current_writeback_sentinels = !q->current_writeback_sentinels;
267 }
268}
269
270/*
271 * Sometimes we want to iterate through entries that have been pushed since
272 * a certain event. We use sentinel entries on the queues to delimit these
273 * 'tick' events.
274 */
275static void queue_tick(struct queue *q)
276{
277 unsigned i;
278
279 for (i = 0; i < NR_QUEUE_LEVELS; i++) {
280 list_del(q->sentinels + i);
281 list_add_tail(q->sentinels + i, q->qs + i);
282 }
283}
284
285typedef void (*iter_fn)(struct list_head *, void *);
286static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
287{
288 unsigned i;
289 struct list_head *h;
290
291 for (i = 0; i < NR_QUEUE_LEVELS; i++) {
292 list_for_each_prev(h, q->qs + i) {
293 if (is_sentinel(q, h))
294 break;
295
296 fn(h, context);
297 }
298 }
299}
300
220/*----------------------------------------------------------------*/ 301/*----------------------------------------------------------------*/
221 302
222/* 303/*
@@ -232,8 +313,6 @@ struct entry {
232 */ 313 */
233 bool dirty:1; 314 bool dirty:1;
234 unsigned hit_count; 315 unsigned hit_count;
235 unsigned generation;
236 unsigned tick;
237}; 316};
238 317
239/* 318/*
@@ -481,7 +560,6 @@ static bool in_cache(struct mq_policy *mq, struct entry *e)
481 */ 560 */
482static void push(struct mq_policy *mq, struct entry *e) 561static void push(struct mq_policy *mq, struct entry *e)
483{ 562{
484 e->tick = mq->tick;
485 hash_insert(mq, e); 563 hash_insert(mq, e);
486 564
487 if (in_cache(mq, e)) 565 if (in_cache(mq, e))
@@ -496,7 +574,11 @@ static void push(struct mq_policy *mq, struct entry *e)
496 */ 574 */
497static void del(struct mq_policy *mq, struct entry *e) 575static void del(struct mq_policy *mq, struct entry *e)
498{ 576{
499 queue_remove(&e->list); 577 if (in_cache(mq, e))
578 queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
579 else
580 queue_remove(&mq->pre_cache, &e->list);
581
500 hash_remove(e); 582 hash_remove(e);
501} 583}
502 584
@@ -518,18 +600,24 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q)
518 return e; 600 return e;
519} 601}
520 602
521static struct entry *peek(struct queue *q) 603static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
522{ 604{
523 struct list_head *h = queue_peek(q); 605 struct entry *e;
524 return h ? container_of(h, struct entry, list) : NULL; 606 struct list_head *h = queue_pop_old(q);
607
608 if (!h)
609 return NULL;
610
611 e = container_of(h, struct entry, list);
612 hash_remove(e);
613
614 return e;
525} 615}
526 616
527/* 617static struct entry *peek(struct queue *q)
528 * Has this entry already been updated?
529 */
530static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
531{ 618{
532 return mq->tick == e->tick; 619 struct list_head *h = queue_peek(q);
620 return h ? container_of(h, struct entry, list) : NULL;
533} 621}
534 622
535/* 623/*
@@ -583,20 +671,9 @@ static void check_generation(struct mq_policy *mq)
583 * Whenever we use an entry we bump up it's hit counter, and push it to the 671 * Whenever we use an entry we bump up it's hit counter, and push it to the
584 * back to it's current level. 672 * back to it's current level.
585 */ 673 */
586static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) 674static void requeue(struct mq_policy *mq, struct entry *e)
587{ 675{
588 if (updated_this_tick(mq, e))
589 return;
590
591 e->hit_count++;
592 mq->hit_count++;
593 check_generation(mq); 676 check_generation(mq);
594
595 /* generation adjustment, to stop the counts increasing forever. */
596 /* FIXME: divide? */
597 /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
598 e->generation = mq->generation;
599
600 del(mq, e); 677 del(mq, e);
601 push(mq, e); 678 push(mq, e);
602} 679}
@@ -703,7 +780,7 @@ static int cache_entry_found(struct mq_policy *mq,
703 struct entry *e, 780 struct entry *e,
704 struct policy_result *result) 781 struct policy_result *result)
705{ 782{
706 requeue_and_update_tick(mq, e); 783 requeue(mq, e);
707 784
708 if (in_cache(mq, e)) { 785 if (in_cache(mq, e)) {
709 result->op = POLICY_HIT; 786 result->op = POLICY_HIT;
@@ -740,8 +817,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
740 new_e->oblock = e->oblock; 817 new_e->oblock = e->oblock;
741 new_e->dirty = false; 818 new_e->dirty = false;
742 new_e->hit_count = e->hit_count; 819 new_e->hit_count = e->hit_count;
743 new_e->generation = e->generation;
744 new_e->tick = e->tick;
745 820
746 del(mq, e); 821 del(mq, e);
747 free_entry(&mq->pre_cache_pool, e); 822 free_entry(&mq->pre_cache_pool, e);
@@ -757,18 +832,16 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
757 int data_dir, struct policy_result *result) 832 int data_dir, struct policy_result *result)
758{ 833{
759 int r = 0; 834 int r = 0;
760 bool updated = updated_this_tick(mq, e);
761 835
762 if ((!discarded_oblock && updated) || 836 if (!should_promote(mq, e, discarded_oblock, data_dir)) {
763 !should_promote(mq, e, discarded_oblock, data_dir)) { 837 requeue(mq, e);
764 requeue_and_update_tick(mq, e);
765 result->op = POLICY_MISS; 838 result->op = POLICY_MISS;
766 839
767 } else if (!can_migrate) 840 } else if (!can_migrate)
768 r = -EWOULDBLOCK; 841 r = -EWOULDBLOCK;
769 842
770 else { 843 else {
771 requeue_and_update_tick(mq, e); 844 requeue(mq, e);
772 r = pre_cache_to_cache(mq, e, result); 845 r = pre_cache_to_cache(mq, e, result);
773 } 846 }
774 847
@@ -795,7 +868,6 @@ static void insert_in_pre_cache(struct mq_policy *mq,
795 e->dirty = false; 868 e->dirty = false;
796 e->oblock = oblock; 869 e->oblock = oblock;
797 e->hit_count = 1; 870 e->hit_count = 1;
798 e->generation = mq->generation;
799 push(mq, e); 871 push(mq, e);
800} 872}
801 873
@@ -828,7 +900,6 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
828 e->oblock = oblock; 900 e->oblock = oblock;
829 e->dirty = false; 901 e->dirty = false;
830 e->hit_count = 1; 902 e->hit_count = 1;
831 e->generation = mq->generation;
832 push(mq, e); 903 push(mq, e);
833 904
834 result->cblock = infer_cblock(&mq->cache_pool, e); 905 result->cblock = infer_cblock(&mq->cache_pool, e);
@@ -905,12 +976,37 @@ static void mq_destroy(struct dm_cache_policy *p)
905 kfree(mq); 976 kfree(mq);
906} 977}
907 978
979static void update_pre_cache_hits(struct list_head *h, void *context)
980{
981 struct entry *e = container_of(h, struct entry, list);
982 e->hit_count++;
983}
984
985static void update_cache_hits(struct list_head *h, void *context)
986{
987 struct mq_policy *mq = context;
988 struct entry *e = container_of(h, struct entry, list);
989 e->hit_count++;
990 mq->hit_count++;
991}
992
908static void copy_tick(struct mq_policy *mq) 993static void copy_tick(struct mq_policy *mq)
909{ 994{
910 unsigned long flags; 995 unsigned long flags, tick;
911 996
912 spin_lock_irqsave(&mq->tick_lock, flags); 997 spin_lock_irqsave(&mq->tick_lock, flags);
913 mq->tick = mq->tick_protected; 998 tick = mq->tick_protected;
999 if (tick != mq->tick) {
1000 queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
1001 queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
1002 queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
1003 mq->tick = tick;
1004 }
1005
1006 queue_tick(&mq->pre_cache);
1007 queue_tick(&mq->cache_dirty);
1008 queue_tick(&mq->cache_clean);
1009 queue_update_writeback_sentinels(&mq->cache_dirty);
914 spin_unlock_irqrestore(&mq->tick_lock, flags); 1010 spin_unlock_irqrestore(&mq->tick_lock, flags);
915} 1011}
916 1012
@@ -1001,7 +1097,6 @@ static int mq_load_mapping(struct dm_cache_policy *p,
1001 e->oblock = oblock; 1097 e->oblock = oblock;
1002 e->dirty = false; /* this gets corrected in a minute */ 1098 e->dirty = false; /* this gets corrected in a minute */
1003 e->hit_count = hint_valid ? hint : 1; 1099 e->hit_count = hint_valid ? hint : 1;
1004 e->generation = mq->generation;
1005 push(mq, e); 1100 push(mq, e);
1006 1101
1007 return 0; 1102 return 0;
@@ -1012,10 +1107,15 @@ static int mq_save_hints(struct mq_policy *mq, struct queue *q,
1012{ 1107{
1013 int r; 1108 int r;
1014 unsigned level; 1109 unsigned level;
1110 struct list_head *h;
1015 struct entry *e; 1111 struct entry *e;
1016 1112
1017 for (level = 0; level < NR_QUEUE_LEVELS; level++) 1113 for (level = 0; level < NR_QUEUE_LEVELS; level++)
1018 list_for_each_entry(e, q->qs + level, list) { 1114 list_for_each(h, q->qs + level) {
1115 if (is_sentinel(q, h))
1116 continue;
1117
1118 e = container_of(h, struct entry, list);
1019 r = fn(context, infer_cblock(&mq->cache_pool, e), 1119 r = fn(context, infer_cblock(&mq->cache_pool, e),
1020 e->oblock, e->hit_count); 1120 e->oblock, e->hit_count);
1021 if (r) 1121 if (r)
@@ -1087,10 +1187,27 @@ static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
1087 return r; 1187 return r;
1088} 1188}
1089 1189
1190#define CLEAN_TARGET_PERCENTAGE 25
1191
1192static bool clean_target_met(struct mq_policy *mq)
1193{
1194 /*
1195 * Cache entries may not be populated. So we're cannot rely on the
1196 * size of the clean queue.
1197 */
1198 unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
1199 unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;
1200
1201 return nr_clean >= target;
1202}
1203
1090static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, 1204static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
1091 dm_cblock_t *cblock) 1205 dm_cblock_t *cblock)
1092{ 1206{
1093 struct entry *e = pop(mq, &mq->cache_dirty); 1207 struct entry *e = pop_old(mq, &mq->cache_dirty);
1208
1209 if (!e && !clean_target_met(mq))
1210 e = pop(mq, &mq->cache_dirty);
1094 1211
1095 if (!e) 1212 if (!e)
1096 return -ENODATA; 1213 return -ENODATA;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 713a96237a80..9eeea196328a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -228,7 +228,7 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
228 * 228 *
229 * tcw: Compatible implementation of the block chaining mode used 229 * tcw: Compatible implementation of the block chaining mode used
230 * by the TrueCrypt device encryption system (prior to version 4.1). 230 * by the TrueCrypt device encryption system (prior to version 4.1).
231 * For more info see: http://www.truecrypt.org 231 * For more info see: https://gitlab.com/cryptsetup/cryptsetup/wikis/TrueCryptOnDiskFormat
232 * It operates on full 512 byte sectors and uses CBC 232 * It operates on full 512 byte sectors and uses CBC
233 * with an IV derived from initial key and the sector number. 233 * with an IV derived from initial key and the sector number.
234 * In addition, whitening value is applied on every sector, whitening 234 * In addition, whitening value is applied on every sector, whitening
@@ -925,11 +925,10 @@ static int crypt_convert(struct crypt_config *cc,
925 925
926 switch (r) { 926 switch (r) {
927 /* async */ 927 /* async */
928 case -EINPROGRESS:
928 case -EBUSY: 929 case -EBUSY:
929 wait_for_completion(&ctx->restart); 930 wait_for_completion(&ctx->restart);
930 reinit_completion(&ctx->restart); 931 reinit_completion(&ctx->restart);
931 /* fall through*/
932 case -EINPROGRESS:
933 ctx->req = NULL; 932 ctx->req = NULL;
934 ctx->cc_sector++; 933 ctx->cc_sector++;
935 continue; 934 continue;
@@ -1124,15 +1123,15 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
1124static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) 1123static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
1125{ 1124{
1126 struct crypt_config *cc = io->cc; 1125 struct crypt_config *cc = io->cc;
1127 struct bio *base_bio = io->base_bio;
1128 struct bio *clone; 1126 struct bio *clone;
1129 1127
1130 /* 1128 /*
1131 * The block layer might modify the bvec array, so always 1129 * We need the original biovec array in order to decrypt
1132 * copy the required bvecs because we need the original 1130 * the whole bio data *afterwards* -- thanks to immutable
1133 * one in order to decrypt the whole bio data *afterwards*. 1131 * biovecs we don't need to worry about the block layer
1132 * modifying the biovec array; so leverage bio_clone_fast().
1134 */ 1133 */
1135 clone = bio_clone_bioset(base_bio, gfp, cc->bs); 1134 clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
1136 if (!clone) 1135 if (!clone)
1137 return 1; 1136 return 1;
1138 1137
@@ -1346,10 +1345,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1346 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); 1345 struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
1347 struct crypt_config *cc = io->cc; 1346 struct crypt_config *cc = io->cc;
1348 1347
1349 if (error == -EINPROGRESS) { 1348 if (error == -EINPROGRESS)
1350 complete(&ctx->restart);
1351 return; 1349 return;
1352 }
1353 1350
1354 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) 1351 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1355 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); 1352 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
@@ -1360,12 +1357,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1360 crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); 1357 crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
1361 1358
1362 if (!atomic_dec_and_test(&ctx->cc_pending)) 1359 if (!atomic_dec_and_test(&ctx->cc_pending))
1363 return; 1360 goto done;
1364 1361
1365 if (bio_data_dir(io->base_bio) == READ) 1362 if (bio_data_dir(io->base_bio) == READ)
1366 kcryptd_crypt_read_done(io); 1363 kcryptd_crypt_read_done(io);
1367 else 1364 else
1368 kcryptd_crypt_write_io_submit(io, 1); 1365 kcryptd_crypt_write_io_submit(io, 1);
1366done:
1367 if (!completion_done(&ctx->restart))
1368 complete(&ctx->restart);
1369} 1369}
1370 1370
1371static void kcryptd_crypt(struct work_struct *work) 1371static void kcryptd_crypt(struct work_struct *work)
@@ -1816,6 +1816,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1816 if (ret) 1816 if (ret)
1817 goto bad; 1817 goto bad;
1818 1818
1819 ret = -EINVAL;
1819 while (opt_params--) { 1820 while (opt_params--) {
1820 opt_string = dm_shift_arg(&as); 1821 opt_string = dm_shift_arg(&as);
1821 if (!opt_string) { 1822 if (!opt_string) {
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 42c3a27a14cc..57b6a1901c91 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -236,7 +236,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
236 delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); 236 delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
237 237
238 delayed->context = dc; 238 delayed->context = dc;
239 delayed->expires = expires = jiffies + (delay * HZ / 1000); 239 delayed->expires = expires = jiffies + msecs_to_jiffies(delay);
240 240
241 mutex_lock(&delayed_bios_lock); 241 mutex_lock(&delayed_bios_lock);
242 242
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 03177ca0b009..058256d2eeea 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -17,7 +17,9 @@
17 17
18#define DM_LOG_USERSPACE_VSN "1.3.0" 18#define DM_LOG_USERSPACE_VSN "1.3.0"
19 19
20struct flush_entry { 20#define FLUSH_ENTRY_POOL_SIZE 16
21
22struct dm_dirty_log_flush_entry {
21 int type; 23 int type;
22 region_t region; 24 region_t region;
23 struct list_head list; 25 struct list_head list;
@@ -34,22 +36,14 @@ struct flush_entry {
34struct log_c { 36struct log_c {
35 struct dm_target *ti; 37 struct dm_target *ti;
36 struct dm_dev *log_dev; 38 struct dm_dev *log_dev;
37 uint32_t region_size;
38 region_t region_count;
39 uint64_t luid;
40 char uuid[DM_UUID_LEN];
41 39
42 char *usr_argv_str; 40 char *usr_argv_str;
43 uint32_t usr_argc; 41 uint32_t usr_argc;
44 42
45 /* 43 uint32_t region_size;
46 * in_sync_hint gets set when doing is_remote_recovering. It 44 region_t region_count;
47 * represents the first region that needs recovery. IOW, the 45 uint64_t luid;
48 * first zero bit of sync_bits. This can be useful for to limit 46 char uuid[DM_UUID_LEN];
49 * traffic for calls like is_remote_recovering and get_resync_work,
50 * but be take care in its use for anything else.
51 */
52 uint64_t in_sync_hint;
53 47
54 /* 48 /*
55 * Mark and clear requests are held until a flush is issued 49 * Mark and clear requests are held until a flush is issued
@@ -62,6 +56,15 @@ struct log_c {
62 struct list_head clear_list; 56 struct list_head clear_list;
63 57
64 /* 58 /*
59 * in_sync_hint gets set when doing is_remote_recovering. It
60 * represents the first region that needs recovery. IOW, the
61 * first zero bit of sync_bits. This can be useful for to limit
62 * traffic for calls like is_remote_recovering and get_resync_work,
63 * but be take care in its use for anything else.
64 */
65 uint64_t in_sync_hint;
66
67 /*
65 * Workqueue for flush of clear region requests. 68 * Workqueue for flush of clear region requests.
66 */ 69 */
67 struct workqueue_struct *dmlog_wq; 70 struct workqueue_struct *dmlog_wq;
@@ -72,19 +75,11 @@ struct log_c {
72 * Combine userspace flush and mark requests for efficiency. 75 * Combine userspace flush and mark requests for efficiency.
73 */ 76 */
74 uint32_t integrated_flush; 77 uint32_t integrated_flush;
75};
76
77static mempool_t *flush_entry_pool;
78 78
79static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) 79 mempool_t *flush_entry_pool;
80{ 80};
81 return kmalloc(sizeof(struct flush_entry), gfp_mask);
82}
83 81
84static void flush_entry_free(void *element, void *pool_data) 82static struct kmem_cache *_flush_entry_cache;
85{
86 kfree(element);
87}
88 83
89static int userspace_do_request(struct log_c *lc, const char *uuid, 84static int userspace_do_request(struct log_c *lc, const char *uuid,
90 int request_type, char *data, size_t data_size, 85 int request_type, char *data, size_t data_size,
@@ -254,6 +249,14 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
254 goto out; 249 goto out;
255 } 250 }
256 251
252 lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE,
253 _flush_entry_cache);
254 if (!lc->flush_entry_pool) {
255 DMERR("Failed to create flush_entry_pool");
256 r = -ENOMEM;
257 goto out;
258 }
259
257 /* 260 /*
258 * Send table string and get back any opened device. 261 * Send table string and get back any opened device.
259 */ 262 */
@@ -310,6 +313,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
310out: 313out:
311 kfree(devices_rdata); 314 kfree(devices_rdata);
312 if (r) { 315 if (r) {
316 if (lc->flush_entry_pool)
317 mempool_destroy(lc->flush_entry_pool);
313 kfree(lc); 318 kfree(lc);
314 kfree(ctr_str); 319 kfree(ctr_str);
315 } else { 320 } else {
@@ -338,6 +343,8 @@ static void userspace_dtr(struct dm_dirty_log *log)
338 if (lc->log_dev) 343 if (lc->log_dev)
339 dm_put_device(lc->ti, lc->log_dev); 344 dm_put_device(lc->ti, lc->log_dev);
340 345
346 mempool_destroy(lc->flush_entry_pool);
347
341 kfree(lc->usr_argv_str); 348 kfree(lc->usr_argv_str);
342 kfree(lc); 349 kfree(lc);
343 350
@@ -461,7 +468,7 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
461static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) 468static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
462{ 469{
463 int r = 0; 470 int r = 0;
464 struct flush_entry *fe; 471 struct dm_dirty_log_flush_entry *fe;
465 472
466 list_for_each_entry(fe, flush_list, list) { 473 list_for_each_entry(fe, flush_list, list) {
467 r = userspace_do_request(lc, lc->uuid, fe->type, 474 r = userspace_do_request(lc, lc->uuid, fe->type,
@@ -481,7 +488,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
481 int r = 0; 488 int r = 0;
482 int count; 489 int count;
483 uint32_t type = 0; 490 uint32_t type = 0;
484 struct flush_entry *fe, *tmp_fe; 491 struct dm_dirty_log_flush_entry *fe, *tmp_fe;
485 LIST_HEAD(tmp_list); 492 LIST_HEAD(tmp_list);
486 uint64_t group[MAX_FLUSH_GROUP_COUNT]; 493 uint64_t group[MAX_FLUSH_GROUP_COUNT];
487 494
@@ -563,7 +570,8 @@ static int userspace_flush(struct dm_dirty_log *log)
563 LIST_HEAD(clear_list); 570 LIST_HEAD(clear_list);
564 int mark_list_is_empty; 571 int mark_list_is_empty;
565 int clear_list_is_empty; 572 int clear_list_is_empty;
566 struct flush_entry *fe, *tmp_fe; 573 struct dm_dirty_log_flush_entry *fe, *tmp_fe;
574 mempool_t *flush_entry_pool = lc->flush_entry_pool;
567 575
568 spin_lock_irqsave(&lc->flush_lock, flags); 576 spin_lock_irqsave(&lc->flush_lock, flags);
569 list_splice_init(&lc->mark_list, &mark_list); 577 list_splice_init(&lc->mark_list, &mark_list);
@@ -643,10 +651,10 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
643{ 651{
644 unsigned long flags; 652 unsigned long flags;
645 struct log_c *lc = log->context; 653 struct log_c *lc = log->context;
646 struct flush_entry *fe; 654 struct dm_dirty_log_flush_entry *fe;
647 655
648 /* Wait for an allocation, but _never_ fail */ 656 /* Wait for an allocation, but _never_ fail */
649 fe = mempool_alloc(flush_entry_pool, GFP_NOIO); 657 fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO);
650 BUG_ON(!fe); 658 BUG_ON(!fe);
651 659
652 spin_lock_irqsave(&lc->flush_lock, flags); 660 spin_lock_irqsave(&lc->flush_lock, flags);
@@ -672,7 +680,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
672{ 680{
673 unsigned long flags; 681 unsigned long flags;
674 struct log_c *lc = log->context; 682 struct log_c *lc = log->context;
675 struct flush_entry *fe; 683 struct dm_dirty_log_flush_entry *fe;
676 684
677 /* 685 /*
678 * If we fail to allocate, we skip the clearing of 686 * If we fail to allocate, we skip the clearing of
@@ -680,7 +688,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
680 * to cause the region to be resync'ed when the 688 * to cause the region to be resync'ed when the
681 * device is activated next time. 689 * device is activated next time.
682 */ 690 */
683 fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); 691 fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC);
684 if (!fe) { 692 if (!fe) {
685 DMERR("Failed to allocate memory to clear region."); 693 DMERR("Failed to allocate memory to clear region.");
686 return; 694 return;
@@ -733,7 +741,6 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
733static void userspace_set_region_sync(struct dm_dirty_log *log, 741static void userspace_set_region_sync(struct dm_dirty_log *log,
734 region_t region, int in_sync) 742 region_t region, int in_sync)
735{ 743{
736 int r;
737 struct log_c *lc = log->context; 744 struct log_c *lc = log->context;
738 struct { 745 struct {
739 region_t r; 746 region_t r;
@@ -743,12 +750,12 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
743 pkg.r = region; 750 pkg.r = region;
744 pkg.i = (int64_t)in_sync; 751 pkg.i = (int64_t)in_sync;
745 752
746 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, 753 (void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
747 (char *)&pkg, sizeof(pkg), NULL, NULL); 754 (char *)&pkg, sizeof(pkg), NULL, NULL);
748 755
749 /* 756 /*
750 * It would be nice to be able to report failures. 757 * It would be nice to be able to report failures.
751 * However, it is easy emough to detect and resolve. 758 * However, it is easy enough to detect and resolve.
752 */ 759 */
753 return; 760 return;
754} 761}
@@ -886,18 +893,16 @@ static int __init userspace_dirty_log_init(void)
886{ 893{
887 int r = 0; 894 int r = 0;
888 895
889 flush_entry_pool = mempool_create(100, flush_entry_alloc, 896 _flush_entry_cache = KMEM_CACHE(dm_dirty_log_flush_entry, 0);
890 flush_entry_free, NULL); 897 if (!_flush_entry_cache) {
891 898 DMWARN("Unable to create flush_entry_cache: No memory.");
892 if (!flush_entry_pool) {
893 DMWARN("Unable to create flush_entry_pool: No memory.");
894 return -ENOMEM; 899 return -ENOMEM;
895 } 900 }
896 901
897 r = dm_ulog_tfr_init(); 902 r = dm_ulog_tfr_init();
898 if (r) { 903 if (r) {
899 DMWARN("Unable to initialize userspace log communications"); 904 DMWARN("Unable to initialize userspace log communications");
900 mempool_destroy(flush_entry_pool); 905 kmem_cache_destroy(_flush_entry_cache);
901 return r; 906 return r;
902 } 907 }
903 908
@@ -905,7 +910,7 @@ static int __init userspace_dirty_log_init(void)
905 if (r) { 910 if (r) {
906 DMWARN("Couldn't register userspace dirty log type"); 911 DMWARN("Couldn't register userspace dirty log type");
907 dm_ulog_tfr_exit(); 912 dm_ulog_tfr_exit();
908 mempool_destroy(flush_entry_pool); 913 kmem_cache_destroy(_flush_entry_cache);
909 return r; 914 return r;
910 } 915 }
911 916
@@ -917,7 +922,7 @@ static void __exit userspace_dirty_log_exit(void)
917{ 922{
918 dm_dirty_log_type_unregister(&_userspace_type); 923 dm_dirty_log_type_unregister(&_userspace_type);
919 dm_ulog_tfr_exit(); 924 dm_ulog_tfr_exit();
920 mempool_destroy(flush_entry_pool); 925 kmem_cache_destroy(_flush_entry_cache);
921 926
922 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); 927 DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
923 return; 928 return;
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 39ad9664d397..fdf8ec304f8d 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -172,6 +172,7 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
172 char *rdata, size_t *rdata_size) 172 char *rdata, size_t *rdata_size)
173{ 173{
174 int r = 0; 174 int r = 0;
175 unsigned long tmo;
175 size_t dummy = 0; 176 size_t dummy = 0;
176 int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); 177 int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
177 struct dm_ulog_request *tfr = prealloced_ulog_tfr; 178 struct dm_ulog_request *tfr = prealloced_ulog_tfr;
@@ -236,11 +237,11 @@ resend:
236 goto out; 237 goto out;
237 } 238 }
238 239
239 r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); 240 tmo = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
240 spin_lock(&receiving_list_lock); 241 spin_lock(&receiving_list_lock);
241 list_del_init(&(pkg.list)); 242 list_del_init(&(pkg.list));
242 spin_unlock(&receiving_list_lock); 243 spin_unlock(&receiving_list_lock);
243 if (!r) { 244 if (!tmo) {
244 DMWARN("[%s] Request timed out: [%u/%u] - retrying", 245 DMWARN("[%s] Request timed out: [%u/%u] - retrying",
245 (strlen(uuid) > 8) ? 246 (strlen(uuid) > 8) ?
246 (uuid + (strlen(uuid) - 8)) : (uuid), 247 (uuid + (strlen(uuid) - 8)) : (uuid),
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
new file mode 100644
index 000000000000..93e08446a87d
--- /dev/null
+++ b/drivers/md/dm-log-writes.c
@@ -0,0 +1,825 @@
1/*
2 * Copyright (C) 2014 Facebook. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include <linux/device-mapper.h>
8
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/blkdev.h>
12#include <linux/bio.h>
13#include <linux/slab.h>
14#include <linux/kthread.h>
15#include <linux/freezer.h>
16
17#define DM_MSG_PREFIX "log-writes"
18
19/*
20 * This target will sequentially log all writes to the target device onto the
21 * log device. This is helpful for replaying writes to check for fs consistency
22 * at all times. This target provides a mechanism to mark specific events to
23 * check data at a later time. So for example you would:
24 *
25 * write data
26 * fsync
27 * dmsetup message /dev/whatever mark mymark
28 * unmount /mnt/test
29 *
30 * Then replay the log up to mymark and check the contents of the replay to
31 * verify it matches what was written.
32 *
33 * We log writes only after they have been flushed, this makes the log describe
34 * close to the order in which the data hits the actual disk, not its cache. So
35 * for example the following sequence (W means write, C means complete)
36 *
37 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
38 *
39 * Would result in the log looking like this:
40 *
41 * c,a,flush,fuad,b,<other writes>,<next flush>
42 *
43 * This is meant to help expose problems where file systems do not properly wait
44 * on data being written before invoking a FLUSH. FUA bypasses cache so once it
45 * completes it is added to the log as it should be on disk.
46 *
47 * We treat DISCARDs as if they don't bypass cache so that they are logged in
48 * order of completion along with the normal writes. If we didn't do it this
49 * way we would process all the discards first and then write all the data, when
50 * in fact we want to do the data and the discard in the order that they
51 * completed.
52 */
53#define LOG_FLUSH_FLAG (1 << 0)
54#define LOG_FUA_FLAG (1 << 1)
55#define LOG_DISCARD_FLAG (1 << 2)
56#define LOG_MARK_FLAG (1 << 3)
57
58#define WRITE_LOG_VERSION 1
59#define WRITE_LOG_MAGIC 0x6a736677736872
60
61/*
62 * The disk format for this is braindead simple.
63 *
64 * At byte 0 we have our super, followed by the following sequence for
65 * nr_entries:
66 *
67 * [ 1 sector ][ entry->nr_sectors ]
68 * [log_write_entry][ data written ]
69 *
70 * The log_write_entry takes up a full sector so we can have arbitrary length
71 * marks and it leaves us room for extra content in the future.
72 */
73
74/*
75 * Basic info about the log for userspace.
76 */
77struct log_write_super {
78 __le64 magic;
79 __le64 version;
80 __le64 nr_entries;
81 __le32 sectorsize;
82};
83
84/*
85 * sector - the sector we wrote.
86 * nr_sectors - the number of sectors we wrote.
87 * flags - flags for this log entry.
88 * data_len - the size of the data in this log entry, this is for private log
89 * entry stuff, the MARK data provided by userspace for example.
90 */
91struct log_write_entry {
92 __le64 sector;
93 __le64 nr_sectors;
94 __le64 flags;
95 __le64 data_len;
96};
97
98struct log_writes_c {
99 struct dm_dev *dev;
100 struct dm_dev *logdev;
101 u64 logged_entries;
102 u32 sectorsize;
103 atomic_t io_blocks;
104 atomic_t pending_blocks;
105 sector_t next_sector;
106 sector_t end_sector;
107 bool logging_enabled;
108 bool device_supports_discard;
109 spinlock_t blocks_lock;
110 struct list_head unflushed_blocks;
111 struct list_head logging_blocks;
112 wait_queue_head_t wait;
113 struct task_struct *log_kthread;
114};
115
116struct pending_block {
117 int vec_cnt;
118 u64 flags;
119 sector_t sector;
120 sector_t nr_sectors;
121 char *data;
122 u32 datalen;
123 struct list_head list;
124 struct bio_vec vecs[0];
125};
126
127struct per_bio_data {
128 struct pending_block *block;
129};
130
131static void put_pending_block(struct log_writes_c *lc)
132{
133 if (atomic_dec_and_test(&lc->pending_blocks)) {
134 smp_mb__after_atomic();
135 if (waitqueue_active(&lc->wait))
136 wake_up(&lc->wait);
137 }
138}
139
140static void put_io_block(struct log_writes_c *lc)
141{
142 if (atomic_dec_and_test(&lc->io_blocks)) {
143 smp_mb__after_atomic();
144 if (waitqueue_active(&lc->wait))
145 wake_up(&lc->wait);
146 }
147}
148
149static void log_end_io(struct bio *bio, int err)
150{
151 struct log_writes_c *lc = bio->bi_private;
152 struct bio_vec *bvec;
153 int i;
154
155 if (err) {
156 unsigned long flags;
157
158 DMERR("Error writing log block, error=%d", err);
159 spin_lock_irqsave(&lc->blocks_lock, flags);
160 lc->logging_enabled = false;
161 spin_unlock_irqrestore(&lc->blocks_lock, flags);
162 }
163
164 bio_for_each_segment_all(bvec, bio, i)
165 __free_page(bvec->bv_page);
166
167 put_io_block(lc);
168 bio_put(bio);
169}
170
171/*
172 * Meant to be called if there is an error, it will free all the pages
173 * associated with the block.
174 */
175static void free_pending_block(struct log_writes_c *lc,
176 struct pending_block *block)
177{
178 int i;
179
180 for (i = 0; i < block->vec_cnt; i++) {
181 if (block->vecs[i].bv_page)
182 __free_page(block->vecs[i].bv_page);
183 }
184 kfree(block->data);
185 kfree(block);
186 put_pending_block(lc);
187}
188
189static int write_metadata(struct log_writes_c *lc, void *entry,
190 size_t entrylen, void *data, size_t datalen,
191 sector_t sector)
192{
193 struct bio *bio;
194 struct page *page;
195 void *ptr;
196 size_t ret;
197
198 bio = bio_alloc(GFP_KERNEL, 1);
199 if (!bio) {
200 DMERR("Couldn't alloc log bio");
201 goto error;
202 }
203 bio->bi_iter.bi_size = 0;
204 bio->bi_iter.bi_sector = sector;
205 bio->bi_bdev = lc->logdev->bdev;
206 bio->bi_end_io = log_end_io;
207 bio->bi_private = lc;
208 set_bit(BIO_UPTODATE, &bio->bi_flags);
209
210 page = alloc_page(GFP_KERNEL);
211 if (!page) {
212 DMERR("Couldn't alloc log page");
213 bio_put(bio);
214 goto error;
215 }
216
217 ptr = kmap_atomic(page);
218 memcpy(ptr, entry, entrylen);
219 if (datalen)
220 memcpy(ptr + entrylen, data, datalen);
221 memset(ptr + entrylen + datalen, 0,
222 lc->sectorsize - entrylen - datalen);
223 kunmap_atomic(ptr);
224
225 ret = bio_add_page(bio, page, lc->sectorsize, 0);
226 if (ret != lc->sectorsize) {
227 DMERR("Couldn't add page to the log block");
228 goto error_bio;
229 }
230 submit_bio(WRITE, bio);
231 return 0;
232error_bio:
233 bio_put(bio);
234 __free_page(page);
235error:
236 put_io_block(lc);
237 return -1;
238}
239
240static int log_one_block(struct log_writes_c *lc,
241 struct pending_block *block, sector_t sector)
242{
243 struct bio *bio;
244 struct log_write_entry entry;
245 size_t ret;
246 int i;
247
248 entry.sector = cpu_to_le64(block->sector);
249 entry.nr_sectors = cpu_to_le64(block->nr_sectors);
250 entry.flags = cpu_to_le64(block->flags);
251 entry.data_len = cpu_to_le64(block->datalen);
252 if (write_metadata(lc, &entry, sizeof(entry), block->data,
253 block->datalen, sector)) {
254 free_pending_block(lc, block);
255 return -1;
256 }
257
258 if (!block->vec_cnt)
259 goto out;
260 sector++;
261
262 bio = bio_alloc(GFP_KERNEL, block->vec_cnt);
263 if (!bio) {
264 DMERR("Couldn't alloc log bio");
265 goto error;
266 }
267 atomic_inc(&lc->io_blocks);
268 bio->bi_iter.bi_size = 0;
269 bio->bi_iter.bi_sector = sector;
270 bio->bi_bdev = lc->logdev->bdev;
271 bio->bi_end_io = log_end_io;
272 bio->bi_private = lc;
273 set_bit(BIO_UPTODATE, &bio->bi_flags);
274
275 for (i = 0; i < block->vec_cnt; i++) {
276 /*
277 * The page offset is always 0 because we allocate a new page
278 * for every bvec in the original bio for simplicity sake.
279 */
280 ret = bio_add_page(bio, block->vecs[i].bv_page,
281 block->vecs[i].bv_len, 0);
282 if (ret != block->vecs[i].bv_len) {
283 atomic_inc(&lc->io_blocks);
284 submit_bio(WRITE, bio);
285 bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i);
286 if (!bio) {
287 DMERR("Couldn't alloc log bio");
288 goto error;
289 }
290 bio->bi_iter.bi_size = 0;
291 bio->bi_iter.bi_sector = sector;
292 bio->bi_bdev = lc->logdev->bdev;
293 bio->bi_end_io = log_end_io;
294 bio->bi_private = lc;
295 set_bit(BIO_UPTODATE, &bio->bi_flags);
296
297 ret = bio_add_page(bio, block->vecs[i].bv_page,
298 block->vecs[i].bv_len, 0);
299 if (ret != block->vecs[i].bv_len) {
300 DMERR("Couldn't add page on new bio?");
301 bio_put(bio);
302 goto error;
303 }
304 }
305 sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
306 }
307 submit_bio(WRITE, bio);
308out:
309 kfree(block->data);
310 kfree(block);
311 put_pending_block(lc);
312 return 0;
313error:
314 free_pending_block(lc, block);
315 put_io_block(lc);
316 return -1;
317}
318
319static int log_super(struct log_writes_c *lc)
320{
321 struct log_write_super super;
322
323 super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
324 super.version = cpu_to_le64(WRITE_LOG_VERSION);
325 super.nr_entries = cpu_to_le64(lc->logged_entries);
326 super.sectorsize = cpu_to_le32(lc->sectorsize);
327
328 if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
329 DMERR("Couldn't write super");
330 return -1;
331 }
332
333 return 0;
334}
335
336static inline sector_t logdev_last_sector(struct log_writes_c *lc)
337{
338 return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
339}
340
341static int log_writes_kthread(void *arg)
342{
343 struct log_writes_c *lc = (struct log_writes_c *)arg;
344 sector_t sector = 0;
345
346 while (!kthread_should_stop()) {
347 bool super = false;
348 bool logging_enabled;
349 struct pending_block *block = NULL;
350 int ret;
351
352 spin_lock_irq(&lc->blocks_lock);
353 if (!list_empty(&lc->logging_blocks)) {
354 block = list_first_entry(&lc->logging_blocks,
355 struct pending_block, list);
356 list_del_init(&block->list);
357 if (!lc->logging_enabled)
358 goto next;
359
360 sector = lc->next_sector;
361 if (block->flags & LOG_DISCARD_FLAG)
362 lc->next_sector++;
363 else
364 lc->next_sector += block->nr_sectors + 1;
365
366 /*
367 * Apparently the size of the device may not be known
368 * right away, so handle this properly.
369 */
370 if (!lc->end_sector)
371 lc->end_sector = logdev_last_sector(lc);
372 if (lc->end_sector &&
373 lc->next_sector >= lc->end_sector) {
374 DMERR("Ran out of space on the logdev");
375 lc->logging_enabled = false;
376 goto next;
377 }
378 lc->logged_entries++;
379 atomic_inc(&lc->io_blocks);
380
381 super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
382 if (super)
383 atomic_inc(&lc->io_blocks);
384 }
385next:
386 logging_enabled = lc->logging_enabled;
387 spin_unlock_irq(&lc->blocks_lock);
388 if (block) {
389 if (logging_enabled) {
390 ret = log_one_block(lc, block, sector);
391 if (!ret && super)
392 ret = log_super(lc);
393 if (ret) {
394 spin_lock_irq(&lc->blocks_lock);
395 lc->logging_enabled = false;
396 spin_unlock_irq(&lc->blocks_lock);
397 }
398 } else
399 free_pending_block(lc, block);
400 continue;
401 }
402
403 if (!try_to_freeze()) {
404 set_current_state(TASK_INTERRUPTIBLE);
405 if (!kthread_should_stop() &&
406 !atomic_read(&lc->pending_blocks))
407 schedule();
408 __set_current_state(TASK_RUNNING);
409 }
410 }
411 return 0;
412}
413
414/*
415 * Construct a log-writes mapping:
416 * log-writes <dev_path> <log_dev_path>
417 */
418static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
419{
420 struct log_writes_c *lc;
421 struct dm_arg_set as;
422 const char *devname, *logdevname;
423
424 as.argc = argc;
425 as.argv = argv;
426
427 if (argc < 2) {
428 ti->error = "Invalid argument count";
429 return -EINVAL;
430 }
431
432 lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
433 if (!lc) {
434 ti->error = "Cannot allocate context";
435 return -ENOMEM;
436 }
437 spin_lock_init(&lc->blocks_lock);
438 INIT_LIST_HEAD(&lc->unflushed_blocks);
439 INIT_LIST_HEAD(&lc->logging_blocks);
440 init_waitqueue_head(&lc->wait);
441 lc->sectorsize = 1 << SECTOR_SHIFT;
442 atomic_set(&lc->io_blocks, 0);
443 atomic_set(&lc->pending_blocks, 0);
444
445 devname = dm_shift_arg(&as);
446 if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) {
447 ti->error = "Device lookup failed";
448 goto bad;
449 }
450
451 logdevname = dm_shift_arg(&as);
452 if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) {
453 ti->error = "Log device lookup failed";
454 dm_put_device(ti, lc->dev);
455 goto bad;
456 }
457
458 lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
459 if (!lc->log_kthread) {
460 ti->error = "Couldn't alloc kthread";
461 dm_put_device(ti, lc->dev);
462 dm_put_device(ti, lc->logdev);
463 goto bad;
464 }
465
466 /* We put the super at sector 0, start logging at sector 1 */
467 lc->next_sector = 1;
468 lc->logging_enabled = true;
469 lc->end_sector = logdev_last_sector(lc);
470 lc->device_supports_discard = true;
471
472 ti->num_flush_bios = 1;
473 ti->flush_supported = true;
474 ti->num_discard_bios = 1;
475 ti->discards_supported = true;
476 ti->per_bio_data_size = sizeof(struct per_bio_data);
477 ti->private = lc;
478 return 0;
479
480bad:
481 kfree(lc);
482 return -EINVAL;
483}
484
485static int log_mark(struct log_writes_c *lc, char *data)
486{
487 struct pending_block *block;
488 size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);
489
490 block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
491 if (!block) {
492 DMERR("Error allocating pending block");
493 return -ENOMEM;
494 }
495
496 block->data = kstrndup(data, maxsize, GFP_KERNEL);
497 if (!block->data) {
498 DMERR("Error copying mark data");
499 kfree(block);
500 return -ENOMEM;
501 }
502 atomic_inc(&lc->pending_blocks);
503 block->datalen = strlen(block->data);
504 block->flags |= LOG_MARK_FLAG;
505 spin_lock_irq(&lc->blocks_lock);
506 list_add_tail(&block->list, &lc->logging_blocks);
507 spin_unlock_irq(&lc->blocks_lock);
508 wake_up_process(lc->log_kthread);
509 return 0;
510}
511
512static void log_writes_dtr(struct dm_target *ti)
513{
514 struct log_writes_c *lc = ti->private;
515
516 spin_lock_irq(&lc->blocks_lock);
517 list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
518 spin_unlock_irq(&lc->blocks_lock);
519
520 /*
521 * This is just nice to have since it'll update the super to include the
522 * unflushed blocks, if it fails we don't really care.
523 */
524 log_mark(lc, "dm-log-writes-end");
525 wake_up_process(lc->log_kthread);
526 wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
527 !atomic_read(&lc->pending_blocks));
528 kthread_stop(lc->log_kthread);
529
530 WARN_ON(!list_empty(&lc->logging_blocks));
531 WARN_ON(!list_empty(&lc->unflushed_blocks));
532 dm_put_device(ti, lc->dev);
533 dm_put_device(ti, lc->logdev);
534 kfree(lc);
535}
536
537static void normal_map_bio(struct dm_target *ti, struct bio *bio)
538{
539 struct log_writes_c *lc = ti->private;
540
541 bio->bi_bdev = lc->dev->bdev;
542}
543
544static int log_writes_map(struct dm_target *ti, struct bio *bio)
545{
546 struct log_writes_c *lc = ti->private;
547 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
548 struct pending_block *block;
549 struct bvec_iter iter;
550 struct bio_vec bv;
551 size_t alloc_size;
552 int i = 0;
553 bool flush_bio = (bio->bi_rw & REQ_FLUSH);
554 bool fua_bio = (bio->bi_rw & REQ_FUA);
555 bool discard_bio = (bio->bi_rw & REQ_DISCARD);
556
557 pb->block = NULL;
558
559 /* Don't bother doing anything if logging has been disabled */
560 if (!lc->logging_enabled)
561 goto map_bio;
562
563 /*
564 * Map reads as normal.
565 */
566 if (bio_data_dir(bio) == READ)
567 goto map_bio;
568
569 /* No sectors and not a flush? Don't care */
570 if (!bio_sectors(bio) && !flush_bio)
571 goto map_bio;
572
573 /*
574 * Discards will have bi_size set but there's no actual data, so just
575 * allocate the size of the pending block.
576 */
577 if (discard_bio)
578 alloc_size = sizeof(struct pending_block);
579 else
580 alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);
581
582 block = kzalloc(alloc_size, GFP_NOIO);
583 if (!block) {
584 DMERR("Error allocating pending block");
585 spin_lock_irq(&lc->blocks_lock);
586 lc->logging_enabled = false;
587 spin_unlock_irq(&lc->blocks_lock);
588 return -ENOMEM;
589 }
590 INIT_LIST_HEAD(&block->list);
591 pb->block = block;
592 atomic_inc(&lc->pending_blocks);
593
594 if (flush_bio)
595 block->flags |= LOG_FLUSH_FLAG;
596 if (fua_bio)
597 block->flags |= LOG_FUA_FLAG;
598 if (discard_bio)
599 block->flags |= LOG_DISCARD_FLAG;
600
601 block->sector = bio->bi_iter.bi_sector;
602 block->nr_sectors = bio_sectors(bio);
603
604 /* We don't need the data, just submit */
605 if (discard_bio) {
606 WARN_ON(flush_bio || fua_bio);
607 if (lc->device_supports_discard)
608 goto map_bio;
609 bio_endio(bio, 0);
610 return DM_MAPIO_SUBMITTED;
611 }
612
613 /* Flush bio, splice the unflushed blocks onto this list and submit */
614 if (flush_bio && !bio_sectors(bio)) {
615 spin_lock_irq(&lc->blocks_lock);
616 list_splice_init(&lc->unflushed_blocks, &block->list);
617 spin_unlock_irq(&lc->blocks_lock);
618 goto map_bio;
619 }
620
621 /*
622 * We will write this bio somewhere else way later so we need to copy
623 * the actual contents into new pages so we know the data will always be
624 * there.
625 *
626 * We do this because this could be a bio from O_DIRECT in which case we
627 * can't just hold onto the page until some later point, we have to
628 * manually copy the contents.
629 */
630 bio_for_each_segment(bv, bio, iter) {
631 struct page *page;
632 void *src, *dst;
633
634 page = alloc_page(GFP_NOIO);
635 if (!page) {
636 DMERR("Error allocing page");
637 free_pending_block(lc, block);
638 spin_lock_irq(&lc->blocks_lock);
639 lc->logging_enabled = false;
640 spin_unlock_irq(&lc->blocks_lock);
641 return -ENOMEM;
642 }
643
644 src = kmap_atomic(bv.bv_page);
645 dst = kmap_atomic(page);
646 memcpy(dst, src + bv.bv_offset, bv.bv_len);
647 kunmap_atomic(dst);
648 kunmap_atomic(src);
649 block->vecs[i].bv_page = page;
650 block->vecs[i].bv_len = bv.bv_len;
651 block->vec_cnt++;
652 i++;
653 }
654
655 /* Had a flush with data in it, weird */
656 if (flush_bio) {
657 spin_lock_irq(&lc->blocks_lock);
658 list_splice_init(&lc->unflushed_blocks, &block->list);
659 spin_unlock_irq(&lc->blocks_lock);
660 }
661map_bio:
662 normal_map_bio(ti, bio);
663 return DM_MAPIO_REMAPPED;
664}
665
666static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
667{
668 struct log_writes_c *lc = ti->private;
669 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
670
671 if (bio_data_dir(bio) == WRITE && pb->block) {
672 struct pending_block *block = pb->block;
673 unsigned long flags;
674
675 spin_lock_irqsave(&lc->blocks_lock, flags);
676 if (block->flags & LOG_FLUSH_FLAG) {
677 list_splice_tail_init(&block->list, &lc->logging_blocks);
678 list_add_tail(&block->list, &lc->logging_blocks);
679 wake_up_process(lc->log_kthread);
680 } else if (block->flags & LOG_FUA_FLAG) {
681 list_add_tail(&block->list, &lc->logging_blocks);
682 wake_up_process(lc->log_kthread);
683 } else
684 list_add_tail(&block->list, &lc->unflushed_blocks);
685 spin_unlock_irqrestore(&lc->blocks_lock, flags);
686 }
687
688 return error;
689}
690
691/*
692 * INFO format: <logged entries> <highest allocated sector>
693 */
694static void log_writes_status(struct dm_target *ti, status_type_t type,
695 unsigned status_flags, char *result,
696 unsigned maxlen)
697{
698 unsigned sz = 0;
699 struct log_writes_c *lc = ti->private;
700
701 switch (type) {
702 case STATUSTYPE_INFO:
703 DMEMIT("%llu %llu", lc->logged_entries,
704 (unsigned long long)lc->next_sector - 1);
705 if (!lc->logging_enabled)
706 DMEMIT(" logging_disabled");
707 break;
708
709 case STATUSTYPE_TABLE:
710 DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
711 break;
712 }
713}
714
715static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
716 unsigned long arg)
717{
718 struct log_writes_c *lc = ti->private;
719 struct dm_dev *dev = lc->dev;
720 int r = 0;
721
722 /*
723 * Only pass ioctls through if the device sizes match exactly.
724 */
725 if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
726 r = scsi_verify_blk_ioctl(NULL, cmd);
727
728 return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
729}
730
731static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
732 struct bio_vec *biovec, int max_size)
733{
734 struct log_writes_c *lc = ti->private;
735 struct request_queue *q = bdev_get_queue(lc->dev->bdev);
736
737 if (!q->merge_bvec_fn)
738 return max_size;
739
740 bvm->bi_bdev = lc->dev->bdev;
741 bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
742
743 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
744}
745
746static int log_writes_iterate_devices(struct dm_target *ti,
747 iterate_devices_callout_fn fn,
748 void *data)
749{
750 struct log_writes_c *lc = ti->private;
751
752 return fn(ti, lc->dev, 0, ti->len, data);
753}
754
755/*
756 * Messages supported:
757 * mark <mark data> - specify the marked data.
758 */
759static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv)
760{
761 int r = -EINVAL;
762 struct log_writes_c *lc = ti->private;
763
764 if (argc != 2) {
765		DMWARN("Invalid log-writes message arguments, expected 2 arguments, got %d", argc);
766 return r;
767 }
768
769 if (!strcasecmp(argv[0], "mark"))
770 r = log_mark(lc, argv[1]);
771 else
772 DMWARN("Unrecognised log writes target message received: %s", argv[0]);
773
774 return r;
775}
776
777static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
778{
779 struct log_writes_c *lc = ti->private;
780 struct request_queue *q = bdev_get_queue(lc->dev->bdev);
781
782 if (!q || !blk_queue_discard(q)) {
783 lc->device_supports_discard = false;
784 limits->discard_granularity = 1 << SECTOR_SHIFT;
785 limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
786 }
787}
788
789static struct target_type log_writes_target = {
790 .name = "log-writes",
791 .version = {1, 0, 0},
792 .module = THIS_MODULE,
793 .ctr = log_writes_ctr,
794 .dtr = log_writes_dtr,
795 .map = log_writes_map,
796 .end_io = normal_end_io,
797 .status = log_writes_status,
798 .ioctl = log_writes_ioctl,
799 .merge = log_writes_merge,
800 .message = log_writes_message,
801 .iterate_devices = log_writes_iterate_devices,
802 .io_hints = log_writes_io_hints,
803};
804
805static int __init dm_log_writes_init(void)
806{
807 int r = dm_register_target(&log_writes_target);
808
809 if (r < 0)
810 DMERR("register failed %d", r);
811
812 return r;
813}
814
815static void __exit dm_log_writes_exit(void)
816{
817 dm_unregister_target(&log_writes_target);
818}
819
820module_init(dm_log_writes_init);
821module_exit(dm_log_writes_exit);
822
823MODULE_DESCRIPTION(DM_NAME " log writes target");
824MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
825MODULE_LICENSE("GPL");
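
The list handling in log_writes_map() and normal_end_io() above boils down to one ordering rule: an ordinary completed write is parked on lc->unflushed_blocks, a FUA write is queued for logging immediately, and a flush splices everything parked so far onto the logging list ahead of its own entry. Below is a minimal user-space sketch of that rule (plain C, not kernel code; the flag values, the list helpers and the main() scenario are invented purely for illustration):

/* toy model of the log-writes flush/FUA ordering */
#include <stdio.h>

#define LOG_FLUSH_FLAG (1 << 0)	/* names mirror the target's flags; values illustrative */
#define LOG_FUA_FLAG   (1 << 1)

struct block {
	unsigned int flags;
	unsigned long long sector;
	struct block *next;
};

/* append a block to the tail of a singly linked list */
static void list_append(struct block **head, struct block *b)
{
	while (*head)
		head = &(*head)->next;
	b->next = NULL;
	*head = b;
}

/* move every entry of *src to the tail of *dst, emptying *src */
static void list_splice_tail(struct block **dst, struct block **src)
{
	struct block *b = *src;

	*src = NULL;
	while (b) {
		struct block *next = b->next;

		list_append(dst, b);
		b = next;
	}
}

static struct block *unflushed;	/* stands in for lc->unflushed_blocks */
static struct block *logging;	/* stands in for lc->logging_blocks */

/* roughly what normal_end_io() does when a write completes */
static void complete_write(struct block *b)
{
	if (b->flags & LOG_FLUSH_FLAG) {
		list_splice_tail(&logging, &unflushed);	/* parked writes first */
		list_append(&logging, b);		/* then the flush itself */
	} else if (b->flags & LOG_FUA_FLAG) {
		list_append(&logging, b);		/* FUA: log right away */
	} else {
		list_append(&unflushed, b);		/* ordinary write: park it */
	}
}

int main(void)
{
	struct block w1 = { 0, 8, NULL };
	struct block w2 = { 0, 16, NULL };
	struct block f = { LOG_FLUSH_FLAG, 0, NULL };
	struct block *b;

	complete_write(&w1);
	complete_write(&w2);
	complete_write(&f);	/* pulls w1 and w2 onto the log ahead of it */

	for (b = logging; b; b = b->next)
		printf("log sector %llu flags %#x\n", b->sector, b->flags);

	return 0;
}

Run standalone, the sketch prints the two parked writes (sectors 8 and 16) before the flush entry, which is the order in which the log kthread (lc->log_kthread above) would later persist them.
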
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d376dc87716e..63953477a07c 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -428,7 +428,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
428 } else { 428 } else {
429 /* blk-mq request-based interface */ 429 /* blk-mq request-based interface */
430 *__clone = blk_get_request(bdev_get_queue(bdev), 430 *__clone = blk_get_request(bdev_get_queue(bdev),
431 rq_data_dir(rq), GFP_KERNEL); 431 rq_data_dir(rq), GFP_ATOMIC);
432 if (IS_ERR(*__clone)) 432 if (IS_ERR(*__clone))
433 /* ENOMEM, requeue */ 433 /* ENOMEM, requeue */
434 return r; 434 return r;
@@ -1627,7 +1627,7 @@ static int __pgpath_busy(struct pgpath *pgpath)
1627{ 1627{
1628 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); 1628 struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1629 1629
1630 return dm_underlying_device_busy(q); 1630 return blk_lld_busy(q);
1631} 1631}
1632 1632
1633/* 1633/*
@@ -1703,7 +1703,7 @@ out:
1703 *---------------------------------------------------------------*/ 1703 *---------------------------------------------------------------*/
1704static struct target_type multipath_target = { 1704static struct target_type multipath_target = {
1705 .name = "multipath", 1705 .name = "multipath",
1706 .version = {1, 8, 0}, 1706 .version = {1, 9, 0},
1707 .module = THIS_MODULE, 1707 .module = THIS_MODULE,
1708 .ctr = multipath_ctr, 1708 .ctr = multipath_ctr,
1709 .dtr = multipath_dtr, 1709 .dtr = multipath_dtr,
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index c62c5ab6aed5..7e818f5f1dc4 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -11,7 +11,7 @@
11struct dm_sysfs_attr { 11struct dm_sysfs_attr {
12 struct attribute attr; 12 struct attribute attr;
13 ssize_t (*show)(struct mapped_device *, char *); 13 ssize_t (*show)(struct mapped_device *, char *);
14 ssize_t (*store)(struct mapped_device *, char *); 14 ssize_t (*store)(struct mapped_device *, const char *, size_t count);
15}; 15};
16 16
17#define DM_ATTR_RO(_name) \ 17#define DM_ATTR_RO(_name) \
@@ -39,6 +39,31 @@ static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
39 return ret; 39 return ret;
40} 40}
41 41
42#define DM_ATTR_RW(_name) \
43struct dm_sysfs_attr dm_attr_##_name = \
44 __ATTR(_name, S_IRUGO | S_IWUSR, dm_attr_##_name##_show, dm_attr_##_name##_store)
45
46static ssize_t dm_attr_store(struct kobject *kobj, struct attribute *attr,
47 const char *page, size_t count)
48{
49 struct dm_sysfs_attr *dm_attr;
50 struct mapped_device *md;
51 ssize_t ret;
52
53 dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
54 if (!dm_attr->store)
55 return -EIO;
56
57 md = dm_get_from_kobject(kobj);
58 if (!md)
59 return -EINVAL;
60
61 ret = dm_attr->store(md, page, count);
62 dm_put(md);
63
64 return ret;
65}
66
42static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf) 67static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
43{ 68{
44 if (dm_copy_name_and_uuid(md, buf, NULL)) 69 if (dm_copy_name_and_uuid(md, buf, NULL))
@@ -64,25 +89,33 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
64 return strlen(buf); 89 return strlen(buf);
65} 90}
66 91
92static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf)
93{
94 sprintf(buf, "%d\n", dm_use_blk_mq(md));
95
96 return strlen(buf);
97}
98
67static DM_ATTR_RO(name); 99static DM_ATTR_RO(name);
68static DM_ATTR_RO(uuid); 100static DM_ATTR_RO(uuid);
69static DM_ATTR_RO(suspended); 101static DM_ATTR_RO(suspended);
102static DM_ATTR_RO(use_blk_mq);
103static DM_ATTR_RW(rq_based_seq_io_merge_deadline);
70 104
71static struct attribute *dm_attrs[] = { 105static struct attribute *dm_attrs[] = {
72 &dm_attr_name.attr, 106 &dm_attr_name.attr,
73 &dm_attr_uuid.attr, 107 &dm_attr_uuid.attr,
74 &dm_attr_suspended.attr, 108 &dm_attr_suspended.attr,
109 &dm_attr_use_blk_mq.attr,
110 &dm_attr_rq_based_seq_io_merge_deadline.attr,
75 NULL, 111 NULL,
76}; 112};
77 113
78static const struct sysfs_ops dm_sysfs_ops = { 114static const struct sysfs_ops dm_sysfs_ops = {
79 .show = dm_attr_show, 115 .show = dm_attr_show,
116 .store = dm_attr_store,
80}; 117};
81 118
82/*
83 * dm kobject is embedded in mapped_device structure
84 * no need to define release function here
85 */
86static struct kobj_type dm_ktype = { 119static struct kobj_type dm_ktype = {
87 .sysfs_ops = &dm_sysfs_ops, 120 .sysfs_ops = &dm_sysfs_ops,
88 .default_attrs = dm_attrs, 121 .default_attrs = dm_attrs,
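
For orientation (an inference from device-mapper's existing sysfs layout rather than anything stated in this hunk): the dm attribute group sits under each mapped device's block node, so the two new entries registered here would be expected at paths such as /sys/block/dm-0/dm/use_blk_mq (read-only, printing 0 or 1) and /sys/block/dm-0/dm/rq_based_seq_io_merge_deadline (writable; the store handler added to dm.c later in this diff parses a decimal microsecond value and caps it at MAX_SEQ_RQ_MERGE_DEADLINE_USECS, i.e. 100000).
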
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 6554d9148927..d9b00b8565c6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -18,6 +18,8 @@
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/delay.h> 19#include <linux/delay.h>
20#include <linux/atomic.h> 20#include <linux/atomic.h>
21#include <linux/blk-mq.h>
22#include <linux/mount.h>
21 23
22#define DM_MSG_PREFIX "table" 24#define DM_MSG_PREFIX "table"
23 25
@@ -372,23 +374,18 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
372 int r; 374 int r;
373 dev_t uninitialized_var(dev); 375 dev_t uninitialized_var(dev);
374 struct dm_dev_internal *dd; 376 struct dm_dev_internal *dd;
375 unsigned int major, minor;
376 struct dm_table *t = ti->table; 377 struct dm_table *t = ti->table;
377 char dummy; 378 struct block_device *bdev;
378 379
379 BUG_ON(!t); 380 BUG_ON(!t);
380 381
381 if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { 382 /* convert the path to a device */
382 /* Extract the major/minor numbers */ 383 bdev = lookup_bdev(path);
383 dev = MKDEV(major, minor); 384 if (IS_ERR(bdev)) {
384 if (MAJOR(dev) != major || MINOR(dev) != minor) 385 dev = name_to_dev_t(path);
385 return -EOVERFLOW; 386 if (!dev)
387 return -ENODEV;
386 } else { 388 } else {
387 /* convert the path to a device */
388 struct block_device *bdev = lookup_bdev(path);
389
390 if (IS_ERR(bdev))
391 return PTR_ERR(bdev);
392 dev = bdev->bd_dev; 389 dev = bdev->bd_dev;
393 bdput(bdev); 390 bdput(bdev);
394 } 391 }
@@ -939,7 +936,7 @@ bool dm_table_mq_request_based(struct dm_table *t)
939 return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED; 936 return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
940} 937}
941 938
942static int dm_table_alloc_md_mempools(struct dm_table *t) 939static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
943{ 940{
944 unsigned type = dm_table_get_type(t); 941 unsigned type = dm_table_get_type(t);
945 unsigned per_bio_data_size = 0; 942 unsigned per_bio_data_size = 0;
@@ -957,7 +954,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t)
957 per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size); 954 per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
958 } 955 }
959 956
960 t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size); 957 t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
961 if (!t->mempools) 958 if (!t->mempools)
962 return -ENOMEM; 959 return -ENOMEM;
963 960
@@ -1127,7 +1124,7 @@ int dm_table_complete(struct dm_table *t)
1127 return r; 1124 return r;
1128 } 1125 }
1129 1126
1130 r = dm_table_alloc_md_mempools(t); 1127 r = dm_table_alloc_md_mempools(t, t->md);
1131 if (r) 1128 if (r)
1132 DMERR("unable to allocate mempools"); 1129 DMERR("unable to allocate mempools");
1133 1130
@@ -1339,14 +1336,14 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1339 continue; 1336 continue;
1340 1337
1341 if (ti->flush_supported) 1338 if (ti->flush_supported)
1342 return 1; 1339 return true;
1343 1340
1344 if (ti->type->iterate_devices && 1341 if (ti->type->iterate_devices &&
1345 ti->type->iterate_devices(ti, device_flush_capable, &flush)) 1342 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1346 return 1; 1343 return true;
1347 } 1344 }
1348 1345
1349 return 0; 1346 return false;
1350} 1347}
1351 1348
1352static bool dm_table_discard_zeroes_data(struct dm_table *t) 1349static bool dm_table_discard_zeroes_data(struct dm_table *t)
@@ -1359,10 +1356,10 @@ static bool dm_table_discard_zeroes_data(struct dm_table *t)
1359 ti = dm_table_get_target(t, i++); 1356 ti = dm_table_get_target(t, i++);
1360 1357
1361 if (ti->discard_zeroes_data_unsupported) 1358 if (ti->discard_zeroes_data_unsupported)
1362 return 0; 1359 return false;
1363 } 1360 }
1364 1361
1365 return 1; 1362 return true;
1366} 1363}
1367 1364
1368static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, 1365static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
@@ -1408,10 +1405,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
1408 1405
1409 if (!ti->type->iterate_devices || 1406 if (!ti->type->iterate_devices ||
1410 !ti->type->iterate_devices(ti, func, NULL)) 1407 !ti->type->iterate_devices(ti, func, NULL))
1411 return 0; 1408 return false;
1412 } 1409 }
1413 1410
1414 return 1; 1411 return true;
1415} 1412}
1416 1413
1417static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, 1414static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
@@ -1468,14 +1465,14 @@ static bool dm_table_supports_discards(struct dm_table *t)
1468 continue; 1465 continue;
1469 1466
1470 if (ti->discards_supported) 1467 if (ti->discards_supported)
1471 return 1; 1468 return true;
1472 1469
1473 if (ti->type->iterate_devices && 1470 if (ti->type->iterate_devices &&
1474 ti->type->iterate_devices(ti, device_discard_capable, NULL)) 1471 ti->type->iterate_devices(ti, device_discard_capable, NULL))
1475 return 1; 1472 return true;
1476 } 1473 }
1477 1474
1478 return 0; 1475 return false;
1479} 1476}
1480 1477
1481void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1478void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -1677,20 +1674,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1677 return r; 1674 return r;
1678} 1675}
1679 1676
1680int dm_table_any_busy_target(struct dm_table *t)
1681{
1682 unsigned i;
1683 struct dm_target *ti;
1684
1685 for (i = 0; i < t->num_targets; i++) {
1686 ti = t->targets + i;
1687 if (ti->type->busy && ti->type->busy(ti))
1688 return 1;
1689 }
1690
1691 return 0;
1692}
1693
1694struct mapped_device *dm_table_get_md(struct dm_table *t) 1677struct mapped_device *dm_table_get_md(struct dm_table *t)
1695{ 1678{
1696 return t->md; 1679 return t->md;
@@ -1709,9 +1692,13 @@ void dm_table_run_md_queue_async(struct dm_table *t)
1709 md = dm_table_get_md(t); 1692 md = dm_table_get_md(t);
1710 queue = dm_get_md_queue(md); 1693 queue = dm_get_md_queue(md);
1711 if (queue) { 1694 if (queue) {
1712 spin_lock_irqsave(queue->queue_lock, flags); 1695 if (queue->mq_ops)
1713 blk_run_queue_async(queue); 1696 blk_mq_run_hw_queues(queue, true);
1714 spin_unlock_irqrestore(queue->queue_lock, flags); 1697 else {
1698 spin_lock_irqsave(queue->queue_lock, flags);
1699 blk_run_queue_async(queue);
1700 spin_unlock_irqrestore(queue->queue_lock, flags);
1701 }
1715 } 1702 }
1716} 1703}
1717EXPORT_SYMBOL(dm_table_run_md_queue_async); 1704EXPORT_SYMBOL(dm_table_run_md_queue_async);
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 7a7bab8947ae..66616db33e6f 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -18,20 +18,39 @@
18 18
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/device-mapper.h> 20#include <linux/device-mapper.h>
21#include <linux/reboot.h>
21#include <crypto/hash.h> 22#include <crypto/hash.h>
22 23
23#define DM_MSG_PREFIX "verity" 24#define DM_MSG_PREFIX "verity"
24 25
26#define DM_VERITY_ENV_LENGTH 42
27#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR"
28
25#define DM_VERITY_IO_VEC_INLINE 16 29#define DM_VERITY_IO_VEC_INLINE 16
26#define DM_VERITY_MEMPOOL_SIZE 4 30#define DM_VERITY_MEMPOOL_SIZE 4
27#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 31#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
28 32
29#define DM_VERITY_MAX_LEVELS 63 33#define DM_VERITY_MAX_LEVELS 63
34#define DM_VERITY_MAX_CORRUPTED_ERRS 100
35
36#define DM_VERITY_OPT_LOGGING "ignore_corruption"
37#define DM_VERITY_OPT_RESTART "restart_on_corruption"
30 38
31static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; 39static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
32 40
33module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); 41module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
34 42
43enum verity_mode {
44 DM_VERITY_MODE_EIO,
45 DM_VERITY_MODE_LOGGING,
46 DM_VERITY_MODE_RESTART
47};
48
49enum verity_block_type {
50 DM_VERITY_BLOCK_TYPE_DATA,
51 DM_VERITY_BLOCK_TYPE_METADATA
52};
53
35struct dm_verity { 54struct dm_verity {
36 struct dm_dev *data_dev; 55 struct dm_dev *data_dev;
37 struct dm_dev *hash_dev; 56 struct dm_dev *hash_dev;
@@ -54,6 +73,8 @@ struct dm_verity {
54 unsigned digest_size; /* digest size for the current hash algorithm */ 73 unsigned digest_size; /* digest size for the current hash algorithm */
55 unsigned shash_descsize;/* the size of temporary space for crypto */ 74 unsigned shash_descsize;/* the size of temporary space for crypto */
56 int hash_failed; /* set to 1 if hash of any block failed */ 75 int hash_failed; /* set to 1 if hash of any block failed */
76 enum verity_mode mode; /* mode for handling verification errors */
77 unsigned corrupted_errs;/* Number of errors for corrupted blocks */
57 78
58 mempool_t *vec_mempool; /* mempool of bio vector */ 79 mempool_t *vec_mempool; /* mempool of bio vector */
59 80
@@ -175,6 +196,57 @@ static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
175} 196}
176 197
177/* 198/*
199 * Handle verification errors.
200 */
201static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
202 unsigned long long block)
203{
204 char verity_env[DM_VERITY_ENV_LENGTH];
205 char *envp[] = { verity_env, NULL };
206 const char *type_str = "";
207 struct mapped_device *md = dm_table_get_md(v->ti->table);
208
209 /* Corruption should be visible in device status in all modes */
210 v->hash_failed = 1;
211
212 if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS)
213 goto out;
214
215 v->corrupted_errs++;
216
217 switch (type) {
218 case DM_VERITY_BLOCK_TYPE_DATA:
219 type_str = "data";
220 break;
221 case DM_VERITY_BLOCK_TYPE_METADATA:
222 type_str = "metadata";
223 break;
224 default:
225 BUG();
226 }
227
228 DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
229 block);
230
231 if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
232 DMERR("%s: reached maximum errors", v->data_dev->name);
233
234 snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu",
235 DM_VERITY_ENV_VAR_NAME, type, block);
236
237 kobject_uevent_env(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE, envp);
238
239out:
240 if (v->mode == DM_VERITY_MODE_LOGGING)
241 return 0;
242
243 if (v->mode == DM_VERITY_MODE_RESTART)
244 kernel_restart("dm-verity device corrupted");
245
246 return 1;
247}
248
249/*
178 * Verify hash of a metadata block pertaining to the specified data block 250 * Verify hash of a metadata block pertaining to the specified data block
179 * ("block" argument) at a specified level ("level" argument). 251 * ("block" argument) at a specified level ("level" argument).
180 * 252 *
@@ -251,11 +323,11 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block,
251 goto release_ret_r; 323 goto release_ret_r;
252 } 324 }
253 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { 325 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
254 DMERR_LIMIT("metadata block %llu is corrupted", 326 if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA,
255 (unsigned long long)hash_block); 327 hash_block)) {
256 v->hash_failed = 1; 328 r = -EIO;
257 r = -EIO; 329 goto release_ret_r;
258 goto release_ret_r; 330 }
259 } else 331 } else
260 aux->hash_verified = 1; 332 aux->hash_verified = 1;
261 } 333 }
@@ -367,10 +439,9 @@ test_block_hash:
367 return r; 439 return r;
368 } 440 }
369 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { 441 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
370 DMERR_LIMIT("data block %llu is corrupted", 442 if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
371 (unsigned long long)(io->block + b)); 443 io->block + b))
372 v->hash_failed = 1; 444 return -EIO;
373 return -EIO;
374 } 445 }
375 } 446 }
376 447
@@ -546,6 +617,19 @@ static void verity_status(struct dm_target *ti, status_type_t type,
546 else 617 else
547 for (x = 0; x < v->salt_size; x++) 618 for (x = 0; x < v->salt_size; x++)
548 DMEMIT("%02x", v->salt[x]); 619 DMEMIT("%02x", v->salt[x]);
620 if (v->mode != DM_VERITY_MODE_EIO) {
621 DMEMIT(" 1 ");
622 switch (v->mode) {
623 case DM_VERITY_MODE_LOGGING:
624 DMEMIT(DM_VERITY_OPT_LOGGING);
625 break;
626 case DM_VERITY_MODE_RESTART:
627 DMEMIT(DM_VERITY_OPT_RESTART);
628 break;
629 default:
630 BUG();
631 }
632 }
549 break; 633 break;
550 } 634 }
551} 635}
@@ -647,13 +731,19 @@ static void verity_dtr(struct dm_target *ti)
647static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) 731static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
648{ 732{
649 struct dm_verity *v; 733 struct dm_verity *v;
650 unsigned num; 734 struct dm_arg_set as;
735 const char *opt_string;
736 unsigned int num, opt_params;
651 unsigned long long num_ll; 737 unsigned long long num_ll;
652 int r; 738 int r;
653 int i; 739 int i;
654 sector_t hash_position; 740 sector_t hash_position;
655 char dummy; 741 char dummy;
656 742
743 static struct dm_arg _args[] = {
744 {0, 1, "Invalid number of feature args"},
745 };
746
657 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); 747 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
658 if (!v) { 748 if (!v) {
659 ti->error = "Cannot allocate verity structure"; 749 ti->error = "Cannot allocate verity structure";
@@ -668,8 +758,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
668 goto bad; 758 goto bad;
669 } 759 }
670 760
671 if (argc != 10) { 761 if (argc < 10) {
672 ti->error = "Invalid argument count: exactly 10 arguments required"; 762 ti->error = "Not enough arguments";
673 r = -EINVAL; 763 r = -EINVAL;
674 goto bad; 764 goto bad;
675 } 765 }
@@ -790,6 +880,39 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
790 } 880 }
791 } 881 }
792 882
883 argv += 10;
884 argc -= 10;
885
886 /* Optional parameters */
887 if (argc) {
888 as.argc = argc;
889 as.argv = argv;
890
891 r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
892 if (r)
893 goto bad;
894
895 while (opt_params) {
896 opt_params--;
897 opt_string = dm_shift_arg(&as);
898 if (!opt_string) {
899 ti->error = "Not enough feature arguments";
900 r = -EINVAL;
901 goto bad;
902 }
903
904 if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING))
905 v->mode = DM_VERITY_MODE_LOGGING;
906 else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART))
907 v->mode = DM_VERITY_MODE_RESTART;
908 else {
909 ti->error = "Invalid feature arguments";
910 r = -EINVAL;
911 goto bad;
912 }
913 }
914 }
915
793 v->hash_per_block_bits = 916 v->hash_per_block_bits =
794 __fls((1 << v->hash_dev_block_bits) / v->digest_size); 917 __fls((1 << v->hash_dev_block_bits) / v->digest_size);
795 918
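
Putting the constructor change together: after the ten mandatory verity arguments, an optional feature-argument group of at most one entry now selects the error-handling mode. A hypothetical dmsetup table line might look like the following (device paths, sizes, digest and salt are placeholders, and the layout assumes the usual verity argument order of version, data device, hash device, data block size, hash block size, data block count, hash start block, algorithm, root digest, salt):

0 409600 verity 1 /dev/sdX1 /dev/sdX2 4096 4096 51200 1 sha256 <root_digest> <salt> 1 ignore_corruption

With that trailing "1 ignore_corruption", verity_handle_err() logs each corrupted block and sends a DM_VERITY_ERR_BLOCK_NR uevent but lets the read complete instead of returning -EIO (and it stops logging after DM_VERITY_MAX_CORRUPTED_ERRS, i.e. 100, corrupted blocks), while verity_status() reports the mode back as "1 ignore_corruption"; choosing "restart_on_corruption" instead makes verity_handle_err() call kernel_restart().
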
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8001fe9e3434..f8c7ca3e8947 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,9 @@
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/wait.h> 22#include <linux/wait.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/ktime.h>
25#include <linux/elevator.h> /* for rq_end_sector() */
26#include <linux/blk-mq.h>
24 27
25#include <trace/events/block.h> 28#include <trace/events/block.h>
26 29
@@ -216,8 +219,29 @@ struct mapped_device {
216 219
217 struct kthread_worker kworker; 220 struct kthread_worker kworker;
218 struct task_struct *kworker_task; 221 struct task_struct *kworker_task;
222
223 /* for request-based merge heuristic in dm_request_fn() */
224 unsigned seq_rq_merge_deadline_usecs;
225 int last_rq_rw;
226 sector_t last_rq_pos;
227 ktime_t last_rq_start_time;
228
229 /* for blk-mq request-based DM support */
230 struct blk_mq_tag_set tag_set;
231 bool use_blk_mq;
219}; 232};
220 233
234#ifdef CONFIG_DM_MQ_DEFAULT
235static bool use_blk_mq = true;
236#else
237static bool use_blk_mq = false;
238#endif
239
240bool dm_use_blk_mq(struct mapped_device *md)
241{
242 return md->use_blk_mq;
243}
244
221/* 245/*
222 * For mempools pre-allocation at the table loading time. 246 * For mempools pre-allocation at the table loading time.
223 */ 247 */
@@ -250,35 +274,35 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
250 */ 274 */
251static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; 275static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
252 276
253static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, 277static unsigned __dm_get_module_param(unsigned *module_param,
254 unsigned def, unsigned max) 278 unsigned def, unsigned max)
255{ 279{
256 unsigned ios = ACCESS_ONCE(*reserved_ios); 280 unsigned param = ACCESS_ONCE(*module_param);
257 unsigned modified_ios = 0; 281 unsigned modified_param = 0;
258 282
259 if (!ios) 283 if (!param)
260 modified_ios = def; 284 modified_param = def;
261 else if (ios > max) 285 else if (param > max)
262 modified_ios = max; 286 modified_param = max;
263 287
264 if (modified_ios) { 288 if (modified_param) {
265 (void)cmpxchg(reserved_ios, ios, modified_ios); 289 (void)cmpxchg(module_param, param, modified_param);
266 ios = modified_ios; 290 param = modified_param;
267 } 291 }
268 292
269 return ios; 293 return param;
270} 294}
271 295
272unsigned dm_get_reserved_bio_based_ios(void) 296unsigned dm_get_reserved_bio_based_ios(void)
273{ 297{
274 return __dm_get_reserved_ios(&reserved_bio_based_ios, 298 return __dm_get_module_param(&reserved_bio_based_ios,
275 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); 299 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
276} 300}
277EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 301EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
278 302
279unsigned dm_get_reserved_rq_based_ios(void) 303unsigned dm_get_reserved_rq_based_ios(void)
280{ 304{
281 return __dm_get_reserved_ios(&reserved_rq_based_ios, 305 return __dm_get_module_param(&reserved_rq_based_ios,
282 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); 306 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
283} 307}
284EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); 308EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
@@ -1017,6 +1041,11 @@ static void end_clone_bio(struct bio *clone, int error)
1017 blk_update_request(tio->orig, 0, nr_bytes); 1041 blk_update_request(tio->orig, 0, nr_bytes);
1018} 1042}
1019 1043
1044static struct dm_rq_target_io *tio_from_request(struct request *rq)
1045{
1046 return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
1047}
1048
1020/* 1049/*
1021 * Don't touch any member of the md after calling this function because 1050 * Don't touch any member of the md after calling this function because
1022 * the md may be freed in dm_put() at the end of this function. 1051 * the md may be freed in dm_put() at the end of this function.
@@ -1024,10 +1053,13 @@ static void end_clone_bio(struct bio *clone, int error)
1024 */ 1053 */
1025static void rq_completed(struct mapped_device *md, int rw, bool run_queue) 1054static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
1026{ 1055{
1056 int nr_requests_pending;
1057
1027 atomic_dec(&md->pending[rw]); 1058 atomic_dec(&md->pending[rw]);
1028 1059
1029 /* nudge anyone waiting on suspend queue */ 1060 /* nudge anyone waiting on suspend queue */
1030 if (!md_in_flight(md)) 1061 nr_requests_pending = md_in_flight(md);
1062 if (!nr_requests_pending)
1031 wake_up(&md->wait); 1063 wake_up(&md->wait);
1032 1064
1033 /* 1065 /*
@@ -1036,8 +1068,13 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
1036 * back into ->request_fn() could deadlock attempting to grab the 1068 * back into ->request_fn() could deadlock attempting to grab the
1037 * queue lock again. 1069 * queue lock again.
1038 */ 1070 */
1039 if (run_queue) 1071 if (run_queue) {
1040 blk_run_queue_async(md->queue); 1072 if (md->queue->mq_ops)
1073 blk_mq_run_hw_queues(md->queue, true);
1074 else if (!nr_requests_pending ||
1075 (nr_requests_pending >= md->queue->nr_congestion_on))
1076 blk_run_queue_async(md->queue);
1077 }
1041 1078
1042 /* 1079 /*
1043 * dm_put() must be at the end of this function. See the comment above 1080 * dm_put() must be at the end of this function. See the comment above
@@ -1048,13 +1085,18 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
1048static void free_rq_clone(struct request *clone) 1085static void free_rq_clone(struct request *clone)
1049{ 1086{
1050 struct dm_rq_target_io *tio = clone->end_io_data; 1087 struct dm_rq_target_io *tio = clone->end_io_data;
1088 struct mapped_device *md = tio->md;
1051 1089
1052 blk_rq_unprep_clone(clone); 1090 blk_rq_unprep_clone(clone);
1053 if (clone->q && clone->q->mq_ops) 1091
1092 if (clone->q->mq_ops)
1054 tio->ti->type->release_clone_rq(clone); 1093 tio->ti->type->release_clone_rq(clone);
1055 else 1094 else if (!md->queue->mq_ops)
1056 free_clone_request(tio->md, clone); 1095 /* request_fn queue stacked on request_fn queue(s) */
1057 free_rq_tio(tio); 1096 free_clone_request(md, clone);
1097
1098 if (!md->queue->mq_ops)
1099 free_rq_tio(tio);
1058} 1100}
1059 1101
1060/* 1102/*
@@ -1083,17 +1125,22 @@ static void dm_end_request(struct request *clone, int error)
1083 } 1125 }
1084 1126
1085 free_rq_clone(clone); 1127 free_rq_clone(clone);
1086 blk_end_request_all(rq, error); 1128 if (!rq->q->mq_ops)
1129 blk_end_request_all(rq, error);
1130 else
1131 blk_mq_end_request(rq, error);
1087 rq_completed(md, rw, true); 1132 rq_completed(md, rw, true);
1088} 1133}
1089 1134
1090static void dm_unprep_request(struct request *rq) 1135static void dm_unprep_request(struct request *rq)
1091{ 1136{
1092 struct dm_rq_target_io *tio = rq->special; 1137 struct dm_rq_target_io *tio = tio_from_request(rq);
1093 struct request *clone = tio->clone; 1138 struct request *clone = tio->clone;
1094 1139
1095 rq->special = NULL; 1140 if (!rq->q->mq_ops) {
1096 rq->cmd_flags &= ~REQ_DONTPREP; 1141 rq->special = NULL;
1142 rq->cmd_flags &= ~REQ_DONTPREP;
1143 }
1097 1144
1098 if (clone) 1145 if (clone)
1099 free_rq_clone(clone); 1146 free_rq_clone(clone);
@@ -1102,18 +1149,29 @@ static void dm_unprep_request(struct request *rq)
1102/* 1149/*
1103 * Requeue the original request of a clone. 1150 * Requeue the original request of a clone.
1104 */ 1151 */
1105static void dm_requeue_unmapped_original_request(struct mapped_device *md, 1152static void old_requeue_request(struct request *rq)
1106 struct request *rq)
1107{ 1153{
1108 int rw = rq_data_dir(rq);
1109 struct request_queue *q = rq->q; 1154 struct request_queue *q = rq->q;
1110 unsigned long flags; 1155 unsigned long flags;
1111 1156
1112 dm_unprep_request(rq);
1113
1114 spin_lock_irqsave(q->queue_lock, flags); 1157 spin_lock_irqsave(q->queue_lock, flags);
1115 blk_requeue_request(q, rq); 1158 blk_requeue_request(q, rq);
1116 spin_unlock_irqrestore(q->queue_lock, flags); 1159 spin_unlock_irqrestore(q->queue_lock, flags);
1160}
1161
1162static void dm_requeue_unmapped_original_request(struct mapped_device *md,
1163 struct request *rq)
1164{
1165 int rw = rq_data_dir(rq);
1166
1167 dm_unprep_request(rq);
1168
1169 if (!rq->q->mq_ops)
1170 old_requeue_request(rq);
1171 else {
1172 blk_mq_requeue_request(rq);
1173 blk_mq_kick_requeue_list(rq->q);
1174 }
1117 1175
1118 rq_completed(md, rw, false); 1176 rq_completed(md, rw, false);
1119} 1177}
@@ -1125,35 +1183,44 @@ static void dm_requeue_unmapped_request(struct request *clone)
1125 dm_requeue_unmapped_original_request(tio->md, tio->orig); 1183 dm_requeue_unmapped_original_request(tio->md, tio->orig);
1126} 1184}
1127 1185
1128static void __stop_queue(struct request_queue *q) 1186static void old_stop_queue(struct request_queue *q)
1129{
1130 blk_stop_queue(q);
1131}
1132
1133static void stop_queue(struct request_queue *q)
1134{ 1187{
1135 unsigned long flags; 1188 unsigned long flags;
1136 1189
1190 if (blk_queue_stopped(q))
1191 return;
1192
1137 spin_lock_irqsave(q->queue_lock, flags); 1193 spin_lock_irqsave(q->queue_lock, flags);
1138 __stop_queue(q); 1194 blk_stop_queue(q);
1139 spin_unlock_irqrestore(q->queue_lock, flags); 1195 spin_unlock_irqrestore(q->queue_lock, flags);
1140} 1196}
1141 1197
1142static void __start_queue(struct request_queue *q) 1198static void stop_queue(struct request_queue *q)
1143{ 1199{
1144 if (blk_queue_stopped(q)) 1200 if (!q->mq_ops)
1145 blk_start_queue(q); 1201 old_stop_queue(q);
1202 else
1203 blk_mq_stop_hw_queues(q);
1146} 1204}
1147 1205
1148static void start_queue(struct request_queue *q) 1206static void old_start_queue(struct request_queue *q)
1149{ 1207{
1150 unsigned long flags; 1208 unsigned long flags;
1151 1209
1152 spin_lock_irqsave(q->queue_lock, flags); 1210 spin_lock_irqsave(q->queue_lock, flags);
1153 __start_queue(q); 1211 if (blk_queue_stopped(q))
1212 blk_start_queue(q);
1154 spin_unlock_irqrestore(q->queue_lock, flags); 1213 spin_unlock_irqrestore(q->queue_lock, flags);
1155} 1214}
1156 1215
1216static void start_queue(struct request_queue *q)
1217{
1218 if (!q->mq_ops)
1219 old_start_queue(q);
1220 else
1221 blk_mq_start_stopped_hw_queues(q, true);
1222}
1223
1157static void dm_done(struct request *clone, int error, bool mapped) 1224static void dm_done(struct request *clone, int error, bool mapped)
1158{ 1225{
1159 int r = error; 1226 int r = error;
@@ -1192,13 +1259,20 @@ static void dm_done(struct request *clone, int error, bool mapped)
1192static void dm_softirq_done(struct request *rq) 1259static void dm_softirq_done(struct request *rq)
1193{ 1260{
1194 bool mapped = true; 1261 bool mapped = true;
1195 struct dm_rq_target_io *tio = rq->special; 1262 struct dm_rq_target_io *tio = tio_from_request(rq);
1196 struct request *clone = tio->clone; 1263 struct request *clone = tio->clone;
1264 int rw;
1197 1265
1198 if (!clone) { 1266 if (!clone) {
1199 blk_end_request_all(rq, tio->error); 1267 rw = rq_data_dir(rq);
1200 rq_completed(tio->md, rq_data_dir(rq), false); 1268 if (!rq->q->mq_ops) {
1201 free_rq_tio(tio); 1269 blk_end_request_all(rq, tio->error);
1270 rq_completed(tio->md, rw, false);
1271 free_rq_tio(tio);
1272 } else {
1273 blk_mq_end_request(rq, tio->error);
1274 rq_completed(tio->md, rw, false);
1275 }
1202 return; 1276 return;
1203 } 1277 }
1204 1278
@@ -1214,7 +1288,7 @@ static void dm_softirq_done(struct request *rq)
1214 */ 1288 */
1215static void dm_complete_request(struct request *rq, int error) 1289static void dm_complete_request(struct request *rq, int error)
1216{ 1290{
1217 struct dm_rq_target_io *tio = rq->special; 1291 struct dm_rq_target_io *tio = tio_from_request(rq);
1218 1292
1219 tio->error = error; 1293 tio->error = error;
1220 blk_complete_request(rq); 1294 blk_complete_request(rq);
@@ -1233,7 +1307,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
1233} 1307}
1234 1308
1235/* 1309/*
1236 * Called with the clone's queue lock held 1310 * Called with the clone's queue lock held (for non-blk-mq)
1237 */ 1311 */
1238static void end_clone_request(struct request *clone, int error) 1312static void end_clone_request(struct request *clone, int error)
1239{ 1313{
@@ -1693,7 +1767,7 @@ out:
1693 * The request function that just remaps the bio built up by 1767 * The request function that just remaps the bio built up by
1694 * dm_merge_bvec. 1768 * dm_merge_bvec.
1695 */ 1769 */
1696static void _dm_request(struct request_queue *q, struct bio *bio) 1770static void dm_make_request(struct request_queue *q, struct bio *bio)
1697{ 1771{
1698 int rw = bio_data_dir(bio); 1772 int rw = bio_data_dir(bio);
1699 struct mapped_device *md = q->queuedata; 1773 struct mapped_device *md = q->queuedata;
@@ -1725,16 +1799,6 @@ int dm_request_based(struct mapped_device *md)
1725 return blk_queue_stackable(md->queue); 1799 return blk_queue_stackable(md->queue);
1726} 1800}
1727 1801
1728static void dm_request(struct request_queue *q, struct bio *bio)
1729{
1730 struct mapped_device *md = q->queuedata;
1731
1732 if (dm_request_based(md))
1733 blk_queue_bio(q, bio);
1734 else
1735 _dm_request(q, bio);
1736}
1737
1738static void dm_dispatch_clone_request(struct request *clone, struct request *rq) 1802static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
1739{ 1803{
1740 int r; 1804 int r;
@@ -1787,15 +1851,25 @@ static int setup_clone(struct request *clone, struct request *rq,
1787static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1851static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1788 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1852 struct dm_rq_target_io *tio, gfp_t gfp_mask)
1789{ 1853{
1790 struct request *clone = alloc_clone_request(md, gfp_mask); 1854 /*
1855 * Do not allocate a clone if tio->clone was already set
1856 * (see: dm_mq_queue_rq).
1857 */
1858 bool alloc_clone = !tio->clone;
1859 struct request *clone;
1791 1860
1792 if (!clone) 1861 if (alloc_clone) {
1793 return NULL; 1862 clone = alloc_clone_request(md, gfp_mask);
1863 if (!clone)
1864 return NULL;
1865 } else
1866 clone = tio->clone;
1794 1867
1795 blk_rq_init(NULL, clone); 1868 blk_rq_init(NULL, clone);
1796 if (setup_clone(clone, rq, tio, gfp_mask)) { 1869 if (setup_clone(clone, rq, tio, gfp_mask)) {
1797 /* -ENOMEM */ 1870 /* -ENOMEM */
1798 free_clone_request(md, clone); 1871 if (alloc_clone)
1872 free_clone_request(md, clone);
1799 return NULL; 1873 return NULL;
1800 } 1874 }
1801 1875
@@ -1804,6 +1878,19 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1804 1878
1805static void map_tio_request(struct kthread_work *work); 1879static void map_tio_request(struct kthread_work *work);
1806 1880
1881static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
1882 struct mapped_device *md)
1883{
1884 tio->md = md;
1885 tio->ti = NULL;
1886 tio->clone = NULL;
1887 tio->orig = rq;
1888 tio->error = 0;
1889 memset(&tio->info, 0, sizeof(tio->info));
1890 if (md->kworker_task)
1891 init_kthread_work(&tio->work, map_tio_request);
1892}
1893
1807static struct dm_rq_target_io *prep_tio(struct request *rq, 1894static struct dm_rq_target_io *prep_tio(struct request *rq,
1808 struct mapped_device *md, gfp_t gfp_mask) 1895 struct mapped_device *md, gfp_t gfp_mask)
1809{ 1896{
@@ -1815,13 +1902,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
1815 if (!tio) 1902 if (!tio)
1816 return NULL; 1903 return NULL;
1817 1904
1818 tio->md = md; 1905 init_tio(tio, rq, md);
1819 tio->ti = NULL;
1820 tio->clone = NULL;
1821 tio->orig = rq;
1822 tio->error = 0;
1823 memset(&tio->info, 0, sizeof(tio->info));
1824 init_kthread_work(&tio->work, map_tio_request);
1825 1906
1826 table = dm_get_live_table(md, &srcu_idx); 1907 table = dm_get_live_table(md, &srcu_idx);
1827 if (!dm_table_mq_request_based(table)) { 1908 if (!dm_table_mq_request_based(table)) {
@@ -1865,11 +1946,11 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
1865 * DM_MAPIO_REQUEUE : the original request needs to be requeued 1946 * DM_MAPIO_REQUEUE : the original request needs to be requeued
1866 * < 0 : the request was completed due to failure 1947 * < 0 : the request was completed due to failure
1867 */ 1948 */
1868static int map_request(struct dm_target *ti, struct request *rq, 1949static int map_request(struct dm_rq_target_io *tio, struct request *rq,
1869 struct mapped_device *md) 1950 struct mapped_device *md)
1870{ 1951{
1871 int r; 1952 int r;
1872 struct dm_rq_target_io *tio = rq->special; 1953 struct dm_target *ti = tio->ti;
1873 struct request *clone = NULL; 1954 struct request *clone = NULL;
1874 1955
1875 if (tio->clone) { 1956 if (tio->clone) {
@@ -1884,7 +1965,7 @@ static int map_request(struct dm_target *ti, struct request *rq,
1884 } 1965 }
1885 if (IS_ERR(clone)) 1966 if (IS_ERR(clone))
1886 return DM_MAPIO_REQUEUE; 1967 return DM_MAPIO_REQUEUE;
1887 if (setup_clone(clone, rq, tio, GFP_KERNEL)) { 1968 if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
1888 /* -ENOMEM */ 1969 /* -ENOMEM */
1889 ti->type->release_clone_rq(clone); 1970 ti->type->release_clone_rq(clone);
1890 return DM_MAPIO_REQUEUE; 1971 return DM_MAPIO_REQUEUE;
@@ -1925,15 +2006,24 @@ static void map_tio_request(struct kthread_work *work)
1925 struct request *rq = tio->orig; 2006 struct request *rq = tio->orig;
1926 struct mapped_device *md = tio->md; 2007 struct mapped_device *md = tio->md;
1927 2008
1928 if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE) 2009 if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
1929 dm_requeue_unmapped_original_request(md, rq); 2010 dm_requeue_unmapped_original_request(md, rq);
1930} 2011}
1931 2012
1932static void dm_start_request(struct mapped_device *md, struct request *orig) 2013static void dm_start_request(struct mapped_device *md, struct request *orig)
1933{ 2014{
1934 blk_start_request(orig); 2015 if (!orig->q->mq_ops)
2016 blk_start_request(orig);
2017 else
2018 blk_mq_start_request(orig);
1935 atomic_inc(&md->pending[rq_data_dir(orig)]); 2019 atomic_inc(&md->pending[rq_data_dir(orig)]);
1936 2020
2021 if (md->seq_rq_merge_deadline_usecs) {
2022 md->last_rq_pos = rq_end_sector(orig);
2023 md->last_rq_rw = rq_data_dir(orig);
2024 md->last_rq_start_time = ktime_get();
2025 }
2026
1937 /* 2027 /*
1938 * Hold the md reference here for the in-flight I/O. 2028 * Hold the md reference here for the in-flight I/O.
1939 * We can't rely on the reference count by device opener, 2029 * We can't rely on the reference count by device opener,
@@ -1944,6 +2034,45 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
1944 dm_get(md); 2034 dm_get(md);
1945} 2035}
1946 2036
2037#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
2038
2039ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
2040{
2041 return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
2042}
2043
2044ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
2045 const char *buf, size_t count)
2046{
2047 unsigned deadline;
2048
2049 if (!dm_request_based(md) || md->use_blk_mq)
2050 return count;
2051
2052 if (kstrtouint(buf, 10, &deadline))
2053 return -EINVAL;
2054
2055 if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
2056 deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
2057
2058 md->seq_rq_merge_deadline_usecs = deadline;
2059
2060 return count;
2061}
2062
2063static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
2064{
2065 ktime_t kt_deadline;
2066
2067 if (!md->seq_rq_merge_deadline_usecs)
2068 return false;
2069
2070 kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
2071 kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
2072
2073 return !ktime_after(ktime_get(), kt_deadline);
2074}
2075
1947/* 2076/*
1948 * q->request_fn for request-based dm. 2077 * q->request_fn for request-based dm.
1949 * Called with the queue lock held. 2078 * Called with the queue lock held.
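
Taken together with the check added to dm_request_fn() in the next hunk, the deadline works as follows (illustrative numbers, assuming HZ=1000): suppose rq_based_seq_io_merge_deadline has been set to 8000 via sysfs (it stays 0, i.e. disabled, by default). When a write is started, dm_start_request() records its end sector, direction and start time t0. If another single-bvec write for exactly that sector is peeked at t0 + 3 ms, dm_request_peeked_before_merge_deadline() sees that the 8000 us deadline has not yet lapsed, and dm_request_fn() jumps to delay_and_out, re-running the queue after blk_delay_queue(q, HZ / 100), roughly 10 ms here, so the block layer gets a chance to merge the sequential I/O rather than dispatching an undersized request. Once the deadline passes, or if md_in_flight() reports nothing outstanding, the heuristic no longer holds the request back.
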
@@ -1967,7 +2096,7 @@ static void dm_request_fn(struct request_queue *q)
1967 while (!blk_queue_stopped(q)) { 2096 while (!blk_queue_stopped(q)) {
1968 rq = blk_peek_request(q); 2097 rq = blk_peek_request(q);
1969 if (!rq) 2098 if (!rq)
1970 goto delay_and_out; 2099 goto out;
1971 2100
1972 /* always use block 0 to find the target for flushes for now */ 2101 /* always use block 0 to find the target for flushes for now */
1973 pos = 0; 2102 pos = 0;
@@ -1986,12 +2115,17 @@ static void dm_request_fn(struct request_queue *q)
1986 continue; 2115 continue;
1987 } 2116 }
1988 2117
2118 if (dm_request_peeked_before_merge_deadline(md) &&
2119 md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
2120 md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
2121 goto delay_and_out;
2122
1989 if (ti->type->busy && ti->type->busy(ti)) 2123 if (ti->type->busy && ti->type->busy(ti))
1990 goto delay_and_out; 2124 goto delay_and_out;
1991 2125
1992 dm_start_request(md, rq); 2126 dm_start_request(md, rq);
1993 2127
1994 tio = rq->special; 2128 tio = tio_from_request(rq);
1995 /* Establish tio->ti before queuing work (map_tio_request) */ 2129 /* Establish tio->ti before queuing work (map_tio_request) */
1996 tio->ti = ti; 2130 tio->ti = ti;
1997 queue_kthread_work(&md->kworker, &tio->work); 2131 queue_kthread_work(&md->kworker, &tio->work);
@@ -2001,33 +2135,11 @@ static void dm_request_fn(struct request_queue *q)
2001 goto out; 2135 goto out;
2002 2136
2003delay_and_out: 2137delay_and_out:
2004 blk_delay_queue(q, HZ / 10); 2138 blk_delay_queue(q, HZ / 100);
2005out: 2139out:
2006 dm_put_live_table(md, srcu_idx); 2140 dm_put_live_table(md, srcu_idx);
2007} 2141}
2008 2142
2009int dm_underlying_device_busy(struct request_queue *q)
2010{
2011 return blk_lld_busy(q);
2012}
2013EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
2014
2015static int dm_lld_busy(struct request_queue *q)
2016{
2017 int r;
2018 struct mapped_device *md = q->queuedata;
2019 struct dm_table *map = dm_get_live_table_fast(md);
2020
2021 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
2022 r = 1;
2023 else
2024 r = dm_table_any_busy_target(map);
2025
2026 dm_put_live_table_fast(md);
2027
2028 return r;
2029}
2030
2031static int dm_any_congested(void *congested_data, int bdi_bits) 2143static int dm_any_congested(void *congested_data, int bdi_bits)
2032{ 2144{
2033 int r = bdi_bits; 2145 int r = bdi_bits;
@@ -2110,7 +2222,7 @@ static void dm_init_md_queue(struct mapped_device *md)
2110{ 2222{
2111 /* 2223 /*
2112 * Request-based dm devices cannot be stacked on top of bio-based dm 2224 * Request-based dm devices cannot be stacked on top of bio-based dm
2113 * devices. The type of this dm device has not been decided yet. 2225 * devices. The type of this dm device may not have been decided yet.
2114 * The type is decided at the first table loading time. 2226 * The type is decided at the first table loading time.
2115 * To prevent problematic device stacking, clear the queue flag 2227 * To prevent problematic device stacking, clear the queue flag
2116 * for request stacking support until then. 2228 * for request stacking support until then.
@@ -2118,13 +2230,21 @@ static void dm_init_md_queue(struct mapped_device *md)
2118 * This queue is new, so no concurrency on the queue_flags. 2230 * This queue is new, so no concurrency on the queue_flags.
2119 */ 2231 */
2120 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 2232 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
2233}
2234
2235static void dm_init_old_md_queue(struct mapped_device *md)
2236{
2237 md->use_blk_mq = false;
2238 dm_init_md_queue(md);
2121 2239
2240 /*
2241 * Initialize aspects of queue that aren't relevant for blk-mq
2242 */
2122 md->queue->queuedata = md; 2243 md->queue->queuedata = md;
2123 md->queue->backing_dev_info.congested_fn = dm_any_congested; 2244 md->queue->backing_dev_info.congested_fn = dm_any_congested;
2124 md->queue->backing_dev_info.congested_data = md; 2245 md->queue->backing_dev_info.congested_data = md;
2125 blk_queue_make_request(md->queue, dm_request); 2246
2126 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 2247 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
2127 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
2128} 2248}
2129 2249
2130/* 2250/*
@@ -2156,6 +2276,7 @@ static struct mapped_device *alloc_dev(int minor)
2156 if (r < 0) 2276 if (r < 0)
2157 goto bad_io_barrier; 2277 goto bad_io_barrier;
2158 2278
2279 md->use_blk_mq = use_blk_mq;
2159 md->type = DM_TYPE_NONE; 2280 md->type = DM_TYPE_NONE;
2160 mutex_init(&md->suspend_lock); 2281 mutex_init(&md->suspend_lock);
2161 mutex_init(&md->type_lock); 2282 mutex_init(&md->type_lock);
@@ -2267,6 +2388,8 @@ static void free_dev(struct mapped_device *md)
2267 del_gendisk(md->disk); 2388 del_gendisk(md->disk);
2268 put_disk(md->disk); 2389 put_disk(md->disk);
2269 blk_cleanup_queue(md->queue); 2390 blk_cleanup_queue(md->queue);
2391 if (md->use_blk_mq)
2392 blk_mq_free_tag_set(&md->tag_set);
2270 bdput(md->bdev); 2393 bdput(md->bdev);
2271 free_minor(minor); 2394 free_minor(minor);
2272 2395
@@ -2278,7 +2401,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2278{ 2401{
2279 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 2402 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2280 2403
2281 if (md->io_pool && md->bs) { 2404 if (md->bs) {
2282 /* The md already has necessary mempools. */ 2405 /* The md already has necessary mempools. */
2283 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { 2406 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
2284 /* 2407 /*
@@ -2310,7 +2433,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2310 p->bs = NULL; 2433 p->bs = NULL;
2311 2434
2312out: 2435out:
2313 /* mempool bind completed, now no need any mempools in the table */ 2436 /* mempool bind completed, no longer need any mempools in the table */
2314 dm_table_free_md_mempools(t); 2437 dm_table_free_md_mempools(t);
2315} 2438}
2316 2439
@@ -2357,7 +2480,7 @@ int dm_queue_merge_is_compulsory(struct request_queue *q)
2357 if (!q->merge_bvec_fn) 2480 if (!q->merge_bvec_fn)
2358 return 0; 2481 return 0;
2359 2482
2360 if (q->make_request_fn == dm_request) { 2483 if (q->make_request_fn == dm_make_request) {
2361 dev_md = q->queuedata; 2484 dev_md = q->queuedata;
2362 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2485 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2363 return 0; 2486 return 0;
@@ -2426,7 +2549,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2426 * This must be done before setting the queue restrictions, 2549 * This must be done before setting the queue restrictions,
2427 * because request-based dm may be run just after the setting. 2550 * because request-based dm may be run just after the setting.
2428 */ 2551 */
2429 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2552 if (dm_table_request_based(t))
2430 stop_queue(q); 2553 stop_queue(q);
2431 2554
2432 __bind_mempools(md, t); 2555 __bind_mempools(md, t);
@@ -2508,14 +2631,6 @@ unsigned dm_get_md_type(struct mapped_device *md)
2508 return md->type; 2631 return md->type;
2509} 2632}
2510 2633
2511static bool dm_md_type_request_based(struct mapped_device *md)
2512{
2513 unsigned table_type = dm_get_md_type(md);
2514
2515 return (table_type == DM_TYPE_REQUEST_BASED ||
2516 table_type == DM_TYPE_MQ_REQUEST_BASED);
2517}
2518
2519struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2634struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2520{ 2635{
2521 return md->immutable_target_type; 2636 return md->immutable_target_type;
@@ -2532,6 +2647,14 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2532} 2647}
2533EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2648EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2534 2649
2650static void init_rq_based_worker_thread(struct mapped_device *md)
2651{
2652 /* Initialize the request-based DM worker thread */
2653 init_kthread_worker(&md->kworker);
2654 md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
2655 "kdmwork-%s", dm_device_name(md));
2656}
2657
2535/* 2658/*
2536 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2659 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2537 */ 2660 */
@@ -2540,27 +2663,160 @@ static int dm_init_request_based_queue(struct mapped_device *md)
2540 struct request_queue *q = NULL; 2663 struct request_queue *q = NULL;
2541 2664
2542 if (md->queue->elevator) 2665 if (md->queue->elevator)
2543 return 1; 2666 return 0;
2544 2667
2545 /* Fully initialize the queue */ 2668 /* Fully initialize the queue */
2546 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2669 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2547 if (!q) 2670 if (!q)
2548 return 0; 2671 return -EINVAL;
2672
2673 /* disable dm_request_fn's merge heuristic by default */
2674 md->seq_rq_merge_deadline_usecs = 0;
2549 2675
2550 md->queue = q; 2676 md->queue = q;
2551 dm_init_md_queue(md); 2677 dm_init_old_md_queue(md);
2552 blk_queue_softirq_done(md->queue, dm_softirq_done); 2678 blk_queue_softirq_done(md->queue, dm_softirq_done);
2553 blk_queue_prep_rq(md->queue, dm_prep_fn); 2679 blk_queue_prep_rq(md->queue, dm_prep_fn);
2554 blk_queue_lld_busy(md->queue, dm_lld_busy);
2555 2680
2556 /* Also initialize the request-based DM worker thread */ 2681 init_rq_based_worker_thread(md);
2557 init_kthread_worker(&md->kworker);
2558 md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
2559 "kdmwork-%s", dm_device_name(md));
2560 2682
2561 elv_register_queue(md->queue); 2683 elv_register_queue(md->queue);
2562 2684
2563 return 1; 2685 return 0;
2686}
2687
2688static int dm_mq_init_request(void *data, struct request *rq,
2689 unsigned int hctx_idx, unsigned int request_idx,
2690 unsigned int numa_node)
2691{
2692 struct mapped_device *md = data;
2693 struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
2694
2695 /*
2696 * Must initialize md member of tio, otherwise it won't
2697 * be available in dm_mq_queue_rq.
2698 */
2699 tio->md = md;
2700
2701 return 0;
2702}
2703
2704static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
2705 const struct blk_mq_queue_data *bd)
2706{
2707 struct request *rq = bd->rq;
2708 struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
2709 struct mapped_device *md = tio->md;
2710 int srcu_idx;
2711 struct dm_table *map = dm_get_live_table(md, &srcu_idx);
2712 struct dm_target *ti;
2713 sector_t pos;
2714
2715 /* always use block 0 to find the target for flushes for now */
2716 pos = 0;
2717 if (!(rq->cmd_flags & REQ_FLUSH))
2718 pos = blk_rq_pos(rq);
2719
2720 ti = dm_table_find_target(map, pos);
2721 if (!dm_target_is_valid(ti)) {
2722 dm_put_live_table(md, srcu_idx);
2723 DMERR_LIMIT("request attempted access beyond the end of device");
2724 /*
2725		 * Must perform the setup that rq_completed() requires
2726		 * before returning BLK_MQ_RQ_QUEUE_ERROR.
2727 */
2728 dm_start_request(md, rq);
2729 return BLK_MQ_RQ_QUEUE_ERROR;
2730 }
2731 dm_put_live_table(md, srcu_idx);
2732
2733 if (ti->type->busy && ti->type->busy(ti))
2734 return BLK_MQ_RQ_QUEUE_BUSY;
2735
2736 dm_start_request(md, rq);
2737
2738 /* Init tio using md established in .init_request */
2739 init_tio(tio, rq, md);
2740
2741 /*
2742 * Establish tio->ti before queuing work (map_tio_request)
2743 * or making direct call to map_request().
2744 */
2745 tio->ti = ti;
2746
2747 /* Clone the request if underlying devices aren't blk-mq */
2748 if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
2749 /* clone request is allocated at the end of the pdu */
2750 tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
2751 if (!clone_rq(rq, md, tio, GFP_ATOMIC))
2752 return BLK_MQ_RQ_QUEUE_BUSY;
2753 queue_kthread_work(&md->kworker, &tio->work);
2754 } else {
2755 /* Direct call is fine since .queue_rq allows allocations */
2756 if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
2757 dm_requeue_unmapped_original_request(md, rq);
2758 }
2759
2760 return BLK_MQ_RQ_QUEUE_OK;
2761}
2762
2763static struct blk_mq_ops dm_mq_ops = {
2764 .queue_rq = dm_mq_queue_rq,
2765 .map_queue = blk_mq_map_queue,
2766 .complete = dm_softirq_done,
2767 .init_request = dm_mq_init_request,
2768};
2769
2770static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
2771{
2772 unsigned md_type = dm_get_md_type(md);
2773 struct request_queue *q;
2774 int err;
2775
2776 memset(&md->tag_set, 0, sizeof(md->tag_set));
2777 md->tag_set.ops = &dm_mq_ops;
2778 md->tag_set.queue_depth = BLKDEV_MAX_RQ;
2779 md->tag_set.numa_node = NUMA_NO_NODE;
2780 md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
2781 md->tag_set.nr_hw_queues = 1;
2782 if (md_type == DM_TYPE_REQUEST_BASED) {
2783 /* make the memory for non-blk-mq clone part of the pdu */
2784 md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
2785 } else
2786 md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
2787 md->tag_set.driver_data = md;
2788
2789 err = blk_mq_alloc_tag_set(&md->tag_set);
2790 if (err)
2791 return err;
2792
2793 q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
2794 if (IS_ERR(q)) {
2795 err = PTR_ERR(q);
2796 goto out_tag_set;
2797 }
2798 md->queue = q;
2799 dm_init_md_queue(md);
2800
2801 /* backfill 'mq' sysfs registration normally done in blk_register_queue */
2802 blk_mq_register_disk(md->disk);
2803
2804 if (md_type == DM_TYPE_REQUEST_BASED)
2805 init_rq_based_worker_thread(md);
2806
2807 return 0;
2808
2809out_tag_set:
2810 blk_mq_free_tag_set(&md->tag_set);
2811 return err;
2812}
2813
2814static unsigned filter_md_type(unsigned type, struct mapped_device *md)
2815{
2816 if (type == DM_TYPE_BIO_BASED)
2817 return type;
2818
2819 return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
2564} 2820}
2565 2821
2566/* 2822/*
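[Editor's note] The hunk above wires DM into blk-mq: it fills a blk_mq_tag_set, points .queue_rq at dm_mq_queue_rq(), and uses cmd_size so each preallocated request carries its dm_rq_target_io (plus, for the non-blk-mq underlying path, an embedded clone request) reachable via blk_mq_rq_to_pdu(). For readers less familiar with that registration pattern, here is a minimal, self-contained sketch of the same idioms in a hypothetical driver; all my_* names are illustrative and this is not DM code, only the circa-4.0/4.1 blk-mq API shape that the hunk follows.

/*
 * Illustrative sketch only: a trivial blk-mq driver that completes every
 * request immediately.  Mirrors the tag_set/ops/cmd_size pattern used by
 * dm_init_request_based_blk_mq_queue() above; "my_*" names are hypothetical.
 */
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/err.h>

struct my_rq_pdu {                      /* per-request data, like dm_rq_target_io */
        int flags;
};

static int my_queue_rq(struct blk_mq_hw_ctx *hctx,
                       const struct blk_mq_queue_data *bd)
{
        struct request *rq = bd->rq;
        struct my_rq_pdu *pdu = blk_mq_rq_to_pdu(rq);   /* lives in the cmd_size area */

        pdu->flags = 0;
        blk_mq_start_request(rq);
        /* a real driver would dispatch rq to hardware or a lower device here */
        blk_mq_end_request(rq, 0);
        return BLK_MQ_RQ_QUEUE_OK;
}

static int my_init_request(void *data, struct request *rq,
                           unsigned int hctx_idx, unsigned int request_idx,
                           unsigned int numa_node)
{
        /* called once per preallocated request; stash driver context if needed */
        return 0;
}

static struct blk_mq_ops my_mq_ops = {
        .queue_rq     = my_queue_rq,
        .map_queue    = blk_mq_map_queue,
        .init_request = my_init_request,
};

static struct blk_mq_tag_set my_tag_set;

static struct request_queue *my_create_queue(void *driver_data)
{
        struct request_queue *q;
        int err;

        memset(&my_tag_set, 0, sizeof(my_tag_set));
        my_tag_set.ops = &my_mq_ops;
        my_tag_set.nr_hw_queues = 1;
        my_tag_set.queue_depth = 64;
        my_tag_set.numa_node = NUMA_NO_NODE;
        my_tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        my_tag_set.cmd_size = sizeof(struct my_rq_pdu); /* per-request pdu */
        my_tag_set.driver_data = driver_data;

        err = blk_mq_alloc_tag_set(&my_tag_set);
        if (err)
                return ERR_PTR(err);

        q = blk_mq_init_queue(&my_tag_set);     /* DM uses the *_allocated_queue variant */
        if (IS_ERR(q))
                blk_mq_free_tag_set(&my_tag_set);
        return q;
}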
@@ -2568,9 +2824,29 @@ static int dm_init_request_based_queue(struct mapped_device *md)
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
-        if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
-                DMWARN("Cannot initialize queue for request-based mapped device");
-                return -EINVAL;
+        int r;
+        unsigned md_type = filter_md_type(dm_get_md_type(md), md);
+
+        switch (md_type) {
+        case DM_TYPE_REQUEST_BASED:
+                r = dm_init_request_based_queue(md);
+                if (r) {
+                        DMWARN("Cannot initialize queue for request-based mapped device");
+                        return r;
+                }
+                break;
+        case DM_TYPE_MQ_REQUEST_BASED:
+                r = dm_init_request_based_blk_mq_queue(md);
+                if (r) {
+                        DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+                        return r;
+                }
+                break;
+        case DM_TYPE_BIO_BASED:
+                dm_init_old_md_queue(md);
+                blk_queue_make_request(md->queue, dm_make_request);
+                blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+                break;
         }
 
         return 0;
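[Editor's note] dm_setup_md_queue() now dispatches on the blk-mq-filtered device type: the two request-based cases build a request queue, while the DM_TYPE_BIO_BASED case only registers a make_request function (and a merge_bvec hook). As a reminder of what that bio-based registration pattern looks like in isolation, here is a hedged sketch for a hypothetical driver using the ~4.1-era make_request_fn signature (void return, two-argument bio_endio); the my_* names are illustrative, not DM's.

/*
 * Illustrative sketch only: make_request-style registration, as performed
 * by the DM_TYPE_BIO_BASED branch above, for a hypothetical bio-based driver.
 */
#include <linux/blkdev.h>
#include <linux/bio.h>

static void my_make_request(struct request_queue *q, struct bio *bio)
{
        /* remap, split or complete the bio here; no struct request involved */
        bio_endio(bio, 0);              /* 4.1-era signature: (bio, error) */
}

static struct request_queue *my_bio_based_queue(void)
{
        struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

        if (!q)
                return NULL;
        blk_queue_make_request(q, my_make_request);
        return q;
}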
@@ -2654,7 +2930,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
         set_bit(DMF_FREEING, &md->flags);
         spin_unlock(&_minor_lock);
 
-        if (dm_request_based(md))
+        if (dm_request_based(md) && md->kworker_task)
                 flush_kthread_worker(&md->kworker);
 
         /*
@@ -2908,7 +3184,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
          */
         if (dm_request_based(md)) {
                 stop_queue(md->queue);
-                flush_kthread_worker(&md->kworker);
+                if (md->kworker_task)
+                        flush_kthread_worker(&md->kworker);
         }
 
         flush_workqueue(md->wq);
@@ -3206,6 +3483,7 @@ struct gendisk *dm_disk(struct mapped_device *md)
 {
         return md->disk;
 }
+EXPORT_SYMBOL_GPL(dm_disk);
 
 struct kobject *dm_kobject(struct mapped_device *md)
 {
@@ -3253,16 +3531,19 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+                                            unsigned integrity, unsigned per_bio_data_size)
 {
         struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-        struct kmem_cache *cachep;
+        struct kmem_cache *cachep = NULL;
         unsigned int pool_size = 0;
         unsigned int front_pad;
 
         if (!pools)
                 return NULL;
 
+        type = filter_md_type(type, md);
+
         switch (type) {
         case DM_TYPE_BIO_BASED:
                 cachep = _io_cache;
@@ -3270,13 +3551,13 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
                 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
                 break;
         case DM_TYPE_REQUEST_BASED:
+                cachep = _rq_tio_cache;
                 pool_size = dm_get_reserved_rq_based_ios();
                 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
                 if (!pools->rq_pool)
                         goto out;
                 /* fall through to setup remaining rq-based pools */
         case DM_TYPE_MQ_REQUEST_BASED:
-                cachep = _rq_tio_cache;
                 if (!pool_size)
                         pool_size = dm_get_reserved_rq_based_ios();
                 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
@@ -3284,12 +3565,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
                 WARN_ON(per_bio_data_size != 0);
                 break;
         default:
-                goto out;
+                BUG();
         }
 
-        pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-        if (!pools->io_pool)
-                goto out;
+        if (cachep) {
+                pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+                if (!pools->io_pool)
+                        goto out;
+        }
 
         pools->bs = bioset_create_nobvec(pool_size, front_pad);
         if (!pools->bs)
@@ -3346,6 +3629,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
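[Editor's note] The new use_blk_mq parameter is declared with S_IRUGO | S_IWUSR, so it can be set at load/boot time (e.g. dm_mod.use_blk_mq=y, assuming the usual dm_mod module name) or toggled later through /sys/module/dm_mod/parameters/use_blk_mq; filter_md_type() above consults the per-device md->use_blk_mq copy when deciding between DM_TYPE_REQUEST_BASED and DM_TYPE_MQ_REQUEST_BASED. The declarations that back this live earlier in dm.c and are not part of these hunks, so the following is only a hedged sketch of the usual shape of such wiring, with my_dev standing in for the opaque mapped_device.

/*
 * Sketch (assumption, not the actual dm.c code): a writable bool module
 * parameter backing a per-device flag that is snapshotted at creation time.
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/stat.h>

static bool use_blk_mq;                 /* e.g. dm_mod.use_blk_mq=y */
module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");

struct my_dev {                         /* stand-in for the opaque mapped_device */
        bool use_blk_mq;
};

static void my_dev_init(struct my_dev *d)
{
        /* capture the global at device-creation time (assumed behaviour) */
        d->use_blk_mq = use_blk_mq;
}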
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 59f53e79db82..6123c2bf9150 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -70,7 +70,6 @@ void dm_table_presuspend_undo_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
-int dm_table_any_busy_target(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
@@ -212,6 +211,8 @@ int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 void dm_internal_suspend(struct mapped_device *md);
 void dm_internal_resume(struct mapped_device *md);
 
+bool dm_use_blk_mq(struct mapped_device *md);
+
 int dm_io_init(void);
 void dm_io_exit(void);
 
@@ -221,7 +222,8 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+                                            unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
@@ -235,4 +237,8 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen
         return !maxlen || strlen(result) + 1 >= maxlen;
 }
 
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+                                                     const char *buf, size_t count);
+
 #endif
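[Editor's note] The dm.h hunk above exports a show/store pair so the new writable rq_based_seq_io_merge_deadline sysfs attribute can be hooked up from dm-sysfs.c. The real handlers live in dm.c and are not shown in these hunks; what follows is only a hedged sketch of the conventional shape of such a pair (kstrtouint parse, clamp, echo back), with my_dev, the field name, and the 100000 bound all hypothetical.

/*
 * Illustrative sketch only: a typical show/store handler pair for a
 * writable, microsecond-valued sysfs attribute.  Not the dm.c implementation.
 */
#include <linux/kernel.h>
#include <linux/errno.h>

struct my_dev {                         /* stand-in for the opaque mapped_device */
        unsigned deadline_usecs;
};

static ssize_t my_deadline_show(struct my_dev *d, char *buf)
{
        return sprintf(buf, "%u\n", d->deadline_usecs);
}

static ssize_t my_deadline_store(struct my_dev *d, const char *buf, size_t count)
{
        unsigned deadline;

        if (kstrtouint(buf, 10, &deadline))
                return -EINVAL;

        d->deadline_usecs = min(deadline, 100000u);     /* clamp; bound is hypothetical */
        return count;
}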