author     Joe Thornber <ejt@redhat.com>       2016-12-15 04:57:31 -0500
committer  Mike Snitzer <snitzer@redhat.com>   2017-03-07 13:28:31 -0500
commit     b29d4986d0da1a27cd35917cdb433672f5c95d7f (patch)
tree       a5d94b86cf1eb759bfef5761015135d747e80561 /drivers/md
parent     742c8fdc31e820503f9267070311d894978d1349 (diff)

dm cache: significant rework to leverage dm-bio-prison-v2

The cache policy interfaces have been updated to work well with the new
bio-prison v2 interface's ability to queue work immediately (promotion,
demotion, etc) -- overriding benefit being reduced latency on processing
IO through the cache.  Previously such work would be left for the DM
cache core to queue on various lists and then process in batches later
-- this caused a serious delay in latency for IO driven by the cache.

The background tracker code was factored out so that all cache policies
can make use of it.

Also, the "cleaner" policy has been removed and is now a variant of the
smq policy that simply disallows migrations.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
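
The shape of the new interface is easiest to see from the caller's side: a
policy queues background work (promotion, demotion, writeback) with a
tracker, the cache core issues that work when it is ready to migrate, and
completes it once the migration finishes.  The following is a minimal
user-space sketch of that queue/issue/complete life cycle; the names
policy_work and btracker_* mirror the ones added below, but the bodies are
simplified stand-ins (plain singly linked lists), not the kernel code.

    /* Minimal user-space model of the background-tracker life cycle.
     * The names mirror those added by this patch; the bodies are simplified
     * stand-ins (singly linked lists instead of list_head plus an rbtree). */
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    enum policy_op { POLICY_PROMOTE, POLICY_DEMOTE, POLICY_WRITEBACK };

    struct policy_work {
            enum policy_op op;
            unsigned long oblock;           /* origin block the work refers to */
            struct policy_work *next;
    };

    struct background_tracker {
            struct policy_work *queued;     /* queued by the policy, not yet issued */
            struct policy_work *issued;     /* handed to the cache core, in flight */
    };

    static int btracker_queue(struct background_tracker *b, struct policy_work *w)
    {
            w->next = b->queued;
            b->queued = w;
            return 0;
    }

    static int btracker_issue(struct background_tracker *b, struct policy_work **out)
    {
            if (!b->queued)
                    return -ENODATA;        /* same convention as the real tracker */
            *out = b->queued;
            b->queued = (*out)->next;
            (*out)->next = b->issued;
            b->issued = *out;
            return 0;
    }

    static void btracker_complete(struct background_tracker *b, struct policy_work *w)
    {
            struct policy_work **p;

            for (p = &b->issued; *p; p = &(*p)->next) {
                    if (*p == w) {
                            *p = w->next;
                            break;
                    }
            }
            free(w);
    }

    int main(void)
    {
            struct background_tracker bt = { NULL, NULL };
            struct policy_work *w = calloc(1, sizeof(*w));
            struct policy_work *issued;

            w->op = POLICY_WRITEBACK;
            w->oblock = 123;
            btracker_queue(&bt, w);                 /* policy decides work is needed */

            while (btracker_issue(&bt, &issued) != -ENODATA) {
                    printf("issuing op %d for oblock %lu\n", issued->op, issued->oblock);
                    btracker_complete(&bt, issued); /* core reports the migration done */
            }
            return 0;
    }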

Diffstat (limited to 'drivers/md')

-rw-r--r--  drivers/md/Kconfig                           8
-rw-r--r--  drivers/md/Makefile                          5
-rw-r--r--  drivers/md/dm-cache-background-tracker.c   238
-rw-r--r--  drivers/md/dm-cache-background-tracker.h    46
-rw-r--r--  drivers/md/dm-cache-metadata.h               2
-rw-r--r--  drivers/md/dm-cache-policy-cleaner.c       469
-rw-r--r--  drivers/md/dm-cache-policy-internal.h       76
-rw-r--r--  drivers/md/dm-cache-policy-smq.c           821
-rw-r--r--  drivers/md/dm-cache-policy.h               187
-rw-r--r--  drivers/md/dm-cache-target.c              2469

10 files changed, 1922 insertions(+), 2399 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b7767da50c26..982cd0626bc7 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -325,14 +325,6 @@ config DM_CACHE_SMQ
 	  of less memory utilization, improved performance and increased
 	  adaptability in the face of changing workloads.
 
-config DM_CACHE_CLEANER
-	tristate "Cleaner Cache Policy (EXPERIMENTAL)"
-	depends on DM_CACHE
-	default y
-	---help---
-	  A simple cache policy that writes back all data to the
-	  origin. Used when decommissioning a dm-cache.
-
 config DM_ERA
 	tristate "Era target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d378b1db7852..2801b2fb452d 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -13,9 +13,9 @@ dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
 dm-bio-prison-y	+= dm-bio-prison-v1.o dm-bio-prison-v2.o
 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
-dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
+dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
+		    dm-cache-background-tracker.o
 dm-cache-smq-y	+= dm-cache-policy-smq.o
-dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 dm-verity-y	+= dm-verity-target.o
 md-mod-y	+= md.o bitmap.o
@@ -57,7 +57,6 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
 obj-$(CONFIG_DM_CACHE_SMQ)	+= dm-cache-smq.o
-obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
 
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
new file mode 100644
index 000000000000..9b1afdfb13f0
--- /dev/null
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (C) 2017 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-background-tracker.h"
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "dm-background-tracker"
+
+struct bt_work {
+	struct list_head list;
+	struct rb_node node;
+	struct policy_work work;
+};
+
+struct background_tracker {
+	unsigned max_work;
+	atomic_t pending_promotes;
+	atomic_t pending_writebacks;
+	atomic_t pending_demotes;
+
+	struct list_head issued;
+	struct list_head queued;
+	struct rb_root pending;
+
+	struct kmem_cache *work_cache;
+};
+
+struct background_tracker *btracker_create(unsigned max_work)
+{
+	struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
+
+	b->max_work = max_work;
+	atomic_set(&b->pending_promotes, 0);
+	atomic_set(&b->pending_writebacks, 0);
+	atomic_set(&b->pending_demotes, 0);
+
+	INIT_LIST_HEAD(&b->issued);
+	INIT_LIST_HEAD(&b->queued);
+
+	b->pending = RB_ROOT;
+	b->work_cache = KMEM_CACHE(bt_work, 0);
+	if (!b->work_cache) {
+		DMERR("couldn't create mempool for background work items");
+		kfree(b);
+		b = NULL;
+	}
+
+	return b;
+}
+EXPORT_SYMBOL_GPL(btracker_create);
+
+void btracker_destroy(struct background_tracker *b)
+{
+	kmem_cache_destroy(b->work_cache);
+	kfree(b);
+}
+EXPORT_SYMBOL_GPL(btracker_destroy);
+
+static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs)
+{
+	if (from_oblock(lhs) < from_oblock(rhs))
+		return -1;
+
+	if (from_oblock(rhs) < from_oblock(lhs))
+		return 1;
+
+	return 0;
+}
+
+static bool __insert_pending(struct background_tracker *b,
+			     struct bt_work *nw)
+{
+	int cmp;
+	struct bt_work *w;
+	struct rb_node **new = &b->pending.rb_node, *parent = NULL;
+
+	while (*new) {
+		w = container_of(*new, struct bt_work, node);
+
+		parent = *new;
+		cmp = cmp_oblock(w->work.oblock, nw->work.oblock);
+		if (cmp < 0)
+			new = &((*new)->rb_left);
+
+		else if (cmp > 0)
+			new = &((*new)->rb_right);
+
+		else
+			/* already present */
+			return false;
+	}
+
+	rb_link_node(&nw->node, parent, new);
+	rb_insert_color(&nw->node, &b->pending);
+
+	return true;
+}
+
+static struct bt_work *__find_pending(struct background_tracker *b,
+				      dm_oblock_t oblock)
+{
+	int cmp;
+	struct bt_work *w;
+	struct rb_node **new = &b->pending.rb_node;
+
+	while (*new) {
+		w = container_of(*new, struct bt_work, node);
+
+		cmp = cmp_oblock(w->work.oblock, oblock);
+		if (cmp < 0)
+			new = &((*new)->rb_left);
+
+		else if (cmp > 0)
+			new = &((*new)->rb_right);
+
+		else
+			break;
+	}
+
+	return *new ? w : NULL;
+}
+
+
+static void update_stats(struct background_tracker *b, struct policy_work *w, int delta)
+{
+	switch (w->op) {
+	case POLICY_PROMOTE:
+		atomic_add(delta, &b->pending_promotes);
+		break;
+
+	case POLICY_DEMOTE:
+		atomic_add(delta, &b->pending_demotes);
+		break;
+
+	case POLICY_WRITEBACK:
+		atomic_add(delta, &b->pending_writebacks);
+		break;
+	}
+}
+
+unsigned btracker_nr_writebacks_queued(struct background_tracker *b)
+{
+	return atomic_read(&b->pending_writebacks);
+}
+EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued);
+
+unsigned btracker_nr_demotions_queued(struct background_tracker *b)
+{
+	return atomic_read(&b->pending_demotes);
+}
+EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued);
+
+static bool max_work_reached(struct background_tracker *b)
+{
+	// FIXME: finish
+	return false;
+}
+
+int btracker_queue(struct background_tracker *b,
+		   struct policy_work *work,
+		   struct policy_work **pwork)
+{
+	struct bt_work *w;
+
+	if (pwork)
+		*pwork = NULL;
+
+	if (max_work_reached(b))
+		return -ENOMEM;
+
+	w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
+	if (!w)
+		return -ENOMEM;
+
+	memcpy(&w->work, work, sizeof(*work));
+
+	if (!__insert_pending(b, w)) {
+		/*
+		 * There was a race, we'll just ignore this second
+		 * bit of work for the same oblock.
+		 */
+		kmem_cache_free(b->work_cache, w);
+		return -EINVAL;
+	}
+
+	if (pwork) {
+		*pwork = &w->work;
+		list_add(&w->list, &b->issued);
+	} else
+		list_add(&w->list, &b->queued);
+	update_stats(b, &w->work, 1);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(btracker_queue);
+
+/*
+ * Returns -ENODATA if there's no work.
+ */
+int btracker_issue(struct background_tracker *b, struct policy_work **work)
+{
+	struct bt_work *w;
+
+	if (list_empty(&b->queued))
+		return -ENODATA;
+
+	w = list_first_entry(&b->queued, struct bt_work, list);
+	list_move(&w->list, &b->issued);
+	*work = &w->work;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(btracker_issue);
+
+void btracker_complete(struct background_tracker *b,
+		       struct policy_work *op)
+{
+	struct bt_work *w = container_of(op, struct bt_work, work);
+
+	update_stats(b, &w->work, -1);
+	rb_erase(&w->node, &b->pending);
+	list_del(&w->list);
+	kmem_cache_free(b->work_cache, w);
+}
+EXPORT_SYMBOL_GPL(btracker_complete);
+
+bool btracker_promotion_already_present(struct background_tracker *b,
+					dm_oblock_t oblock)
+{
+	return __find_pending(b, oblock) != NULL;
+}
+EXPORT_SYMBOL_GPL(btracker_promotion_already_present);
+
+/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h
new file mode 100644
index 000000000000..27ab90dbc275
--- /dev/null
+++ b/drivers/md/dm-cache-background-tracker.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2017 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_CACHE_BACKGROUND_WORK_H
+#define DM_CACHE_BACKGROUND_WORK_H
+
+#include <linux/vmalloc.h>
+#include "dm-cache-policy.h"
+
+/*----------------------------------------------------------------*/
+
+struct background_work;
+struct background_tracker;
+
+/*
+ * FIXME: discuss lack of locking in all methods.
+ */
+struct background_tracker *btracker_create(unsigned max_work);
+void btracker_destroy(struct background_tracker *b);
+
+unsigned btracker_nr_writebacks_queued(struct background_tracker *b);
+unsigned btracker_nr_demotions_queued(struct background_tracker *b);
+
+/*
+ * returns -EINVAL iff the work is already queued. -ENOMEM if the work
+ * couldn't be queued for another reason.
+ */
+int btracker_queue(struct background_tracker *b,
+		   struct policy_work *work,
+		   struct policy_work **pwork);
+
+/*
+ * Returns -ENODATA if there's no work.
+ */
+int btracker_issue(struct background_tracker *b, struct policy_work **work);
+void btracker_complete(struct background_tracker *b,
+		       struct policy_work *op);
+bool btracker_promotion_already_present(struct background_tracker *b,
+					dm_oblock_t oblock);
+
+/*----------------------------------------------------------------*/
+
+#endif
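
Taken together, the comments in this header define a small error contract:
btracker_queue() returns -EINVAL when work for the same oblock is already
tracked and -ENOMEM when no more work can be accepted, while
btracker_issue() returns -ENODATA when nothing is queued.  A caller is
expected to treat the duplicate case as harmless and the other two as
back-pressure.  The fragment below is an illustrative, user-space sketch
of that handling only; the tracker call is stubbed out and nothing here is
taken from the kernel sources.

    /* Illustrative handling of the btracker_queue() return codes documented
     * in the header above.  The tracker is stubbed; only the error-handling
     * shape is of interest. */
    #include <errno.h>
    #include <stdio.h>

    struct policy_work { unsigned long oblock; };
    struct background_tracker;                  /* opaque in this sketch */

    /* Stub standing in for the real btracker_queue() declared above. */
    static int btracker_queue(struct background_tracker *b,
                              struct policy_work *work,
                              struct policy_work **pwork)
    {
            (void) b;
            (void) pwork;
            return work->oblock % 2 ? -EINVAL : 0;  /* pretend odd oblocks are duplicates */
    }

    static void queue_background_work(struct background_tracker *b,
                                      struct policy_work *work)
    {
            switch (btracker_queue(b, work, NULL)) {
            case 0:
                    printf("oblock %lu: tracked, will be issued later\n", work->oblock);
                    break;
            case -EINVAL:
                    /* Work for this oblock is already queued or issued; dropping
                     * the duplicate is the expected response, not an error path. */
                    printf("oblock %lu: already tracked, duplicate dropped\n", work->oblock);
                    break;
            case -ENOMEM:
                    /* Tracker is full; back off and let in-flight work complete. */
                    printf("oblock %lu: tracker full, backing off\n", work->oblock);
                    break;
            }
    }

    int main(void)
    {
            struct policy_work a = { .oblock = 42 }, b = { .oblock = 43 };

            queue_background_work(NULL, &a);
            queue_background_work(NULL, &b);
            return 0;
    }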
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 4f07c08cf107..179ed5bf81a3 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -50,6 +50,8 @@
 #define DM_CACHE_FEATURE_COMPAT_RO_SUPP	  0UL
 #define DM_CACHE_FEATURE_INCOMPAT_SUPP	  0UL
 
+struct dm_cache_metadata;
+
 /*
  * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on
  * failure. If reopening then features must match.
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
deleted file mode 100644
index 2e8a8f1d8358..000000000000
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ /dev/null
@@ -1,469 +0,0 @@
1/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * writeback cache policy supporting flushing out dirty cache blocks.
5 *
6 * This file is released under the GPL.
7 */
8
9#include "dm-cache-policy.h"
10#include "dm.h"
11
12#include <linux/hash.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/vmalloc.h>
16
17/*----------------------------------------------------------------*/
18
19#define DM_MSG_PREFIX "cache cleaner"
20
21/* Cache entry struct. */
22struct wb_cache_entry {
23 struct list_head list;
24 struct hlist_node hlist;
25
26 dm_oblock_t oblock;
27 dm_cblock_t cblock;
28 bool dirty:1;
29 bool pending:1;
30};
31
32struct hash {
33 struct hlist_head *table;
34 dm_block_t hash_bits;
35 unsigned nr_buckets;
36};
37
38struct policy {
39 struct dm_cache_policy policy;
40 spinlock_t lock;
41
42 struct list_head free;
43 struct list_head clean;
44 struct list_head clean_pending;
45 struct list_head dirty;
46
47 /*
48 * We know exactly how many cblocks will be needed,
49 * so we can allocate them up front.
50 */
51 dm_cblock_t cache_size, nr_cblocks_allocated;
52 struct wb_cache_entry *cblocks;
53 struct hash chash;
54};
55
56/*----------------------------------------------------------------------------*/
57
58/*
59 * Low-level functions.
60 */
61static unsigned next_power(unsigned n, unsigned min)
62{
63 return roundup_pow_of_two(max(n, min));
64}
65
66static struct policy *to_policy(struct dm_cache_policy *p)
67{
68 return container_of(p, struct policy, policy);
69}
70
71static struct list_head *list_pop(struct list_head *q)
72{
73 struct list_head *r = q->next;
74
75 list_del(r);
76
77 return r;
78}
79
80/*----------------------------------------------------------------------------*/
81
82/* Allocate/free various resources. */
83static int alloc_hash(struct hash *hash, unsigned elts)
84{
85 hash->nr_buckets = next_power(elts >> 4, 16);
86 hash->hash_bits = __ffs(hash->nr_buckets);
87 hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets);
88
89 return hash->table ? 0 : -ENOMEM;
90}
91
92static void free_hash(struct hash *hash)
93{
94 vfree(hash->table);
95}
96
97static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size)
98{
99 int r = -ENOMEM;
100
101 p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size));
102 if (p->cblocks) {
103 unsigned u = from_cblock(cache_size);
104
105 while (u--)
106 list_add(&p->cblocks[u].list, &p->free);
107
108 p->nr_cblocks_allocated = 0;
109
110 /* Cache entries hash. */
111 r = alloc_hash(&p->chash, from_cblock(cache_size));
112 if (r)
113 vfree(p->cblocks);
114 }
115
116 return r;
117}
118
119static void free_cache_blocks_and_hash(struct policy *p)
120{
121 free_hash(&p->chash);
122 vfree(p->cblocks);
123}
124
125static struct wb_cache_entry *alloc_cache_entry(struct policy *p)
126{
127 struct wb_cache_entry *e;
128
129 BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size));
130
131 e = list_entry(list_pop(&p->free), struct wb_cache_entry, list);
132 p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1);
133
134 return e;
135}
136
137/*----------------------------------------------------------------------------*/
138
139/* Hash functions (lookup, insert, remove). */
140static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock)
141{
142 struct hash *hash = &p->chash;
143 unsigned h = hash_64(from_oblock(oblock), hash->hash_bits);
144 struct wb_cache_entry *cur;
145 struct hlist_head *bucket = &hash->table[h];
146
147 hlist_for_each_entry(cur, bucket, hlist) {
148 if (cur->oblock == oblock) {
149 /* Move upfront bucket for faster access. */
150 hlist_del(&cur->hlist);
151 hlist_add_head(&cur->hlist, bucket);
152 return cur;
153 }
154 }
155
156 return NULL;
157}
158
159static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e)
160{
161 unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits);
162
163 hlist_add_head(&e->hlist, &p->chash.table[h]);
164}
165
166static void remove_cache_hash_entry(struct wb_cache_entry *e)
167{
168 hlist_del(&e->hlist);
169}
170
171/* Public interface (see dm-cache-policy.h */
172static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
173 bool can_block, bool can_migrate, bool discarded_oblock,
174 struct bio *bio, struct policy_locker *locker,
175 struct policy_result *result)
176{
177 struct policy *p = to_policy(pe);
178 struct wb_cache_entry *e;
179 unsigned long flags;
180
181 result->op = POLICY_MISS;
182
183 if (can_block)
184 spin_lock_irqsave(&p->lock, flags);
185
186 else if (!spin_trylock_irqsave(&p->lock, flags))
187 return -EWOULDBLOCK;
188
189 e = lookup_cache_entry(p, oblock);
190 if (e) {
191 result->op = POLICY_HIT;
192 result->cblock = e->cblock;
193
194 }
195
196 spin_unlock_irqrestore(&p->lock, flags);
197
198 return 0;
199}
200
201static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock)
202{
203 int r;
204 struct policy *p = to_policy(pe);
205 struct wb_cache_entry *e;
206 unsigned long flags;
207
208 if (!spin_trylock_irqsave(&p->lock, flags))
209 return -EWOULDBLOCK;
210
211 e = lookup_cache_entry(p, oblock);
212 if (e) {
213 *cblock = e->cblock;
214 r = 0;
215
216 } else
217 r = -ENOENT;
218
219 spin_unlock_irqrestore(&p->lock, flags);
220
221 return r;
222}
223
224static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set)
225{
226 struct policy *p = to_policy(pe);
227 struct wb_cache_entry *e;
228
229 e = lookup_cache_entry(p, oblock);
230 BUG_ON(!e);
231
232 if (set) {
233 if (!e->dirty) {
234 e->dirty = true;
235 list_move(&e->list, &p->dirty);
236 }
237
238 } else {
239 if (e->dirty) {
240 e->pending = false;
241 e->dirty = false;
242 list_move(&e->list, &p->clean);
243 }
244 }
245}
246
247static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
248{
249 struct policy *p = to_policy(pe);
250 unsigned long flags;
251
252 spin_lock_irqsave(&p->lock, flags);
253 __set_clear_dirty(pe, oblock, true);
254 spin_unlock_irqrestore(&p->lock, flags);
255}
256
257static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock)
258{
259 struct policy *p = to_policy(pe);
260 unsigned long flags;
261
262 spin_lock_irqsave(&p->lock, flags);
263 __set_clear_dirty(pe, oblock, false);
264 spin_unlock_irqrestore(&p->lock, flags);
265}
266
267static void add_cache_entry(struct policy *p, struct wb_cache_entry *e)
268{
269 insert_cache_hash_entry(p, e);
270 if (e->dirty)
271 list_add(&e->list, &p->dirty);
272 else
273 list_add(&e->list, &p->clean);
274}
275
276static int wb_load_mapping(struct dm_cache_policy *pe,
277 dm_oblock_t oblock, dm_cblock_t cblock,
278 uint32_t hint, bool hint_valid)
279{
280 int r;
281 struct policy *p = to_policy(pe);
282 struct wb_cache_entry *e = alloc_cache_entry(p);
283
284 if (e) {
285 e->cblock = cblock;
286 e->oblock = oblock;
287 e->dirty = false; /* blocks default to clean */
288 add_cache_entry(p, e);
289 r = 0;
290
291 } else
292 r = -ENOMEM;
293
294 return r;
295}
296
297static void wb_destroy(struct dm_cache_policy *pe)
298{
299 struct policy *p = to_policy(pe);
300
301 free_cache_blocks_and_hash(p);
302 kfree(p);
303}
304
305static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock)
306{
307 struct wb_cache_entry *r = lookup_cache_entry(p, oblock);
308
309 BUG_ON(!r);
310
311 remove_cache_hash_entry(r);
312 list_del(&r->list);
313
314 return r;
315}
316
317static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock)
318{
319 struct policy *p = to_policy(pe);
320 struct wb_cache_entry *e;
321 unsigned long flags;
322
323 spin_lock_irqsave(&p->lock, flags);
324 e = __wb_force_remove_mapping(p, oblock);
325 list_add_tail(&e->list, &p->free);
326 BUG_ON(!from_cblock(p->nr_cblocks_allocated));
327 p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1);
328 spin_unlock_irqrestore(&p->lock, flags);
329}
330
331static void wb_force_mapping(struct dm_cache_policy *pe,
332 dm_oblock_t current_oblock, dm_oblock_t oblock)
333{
334 struct policy *p = to_policy(pe);
335 struct wb_cache_entry *e;
336 unsigned long flags;
337
338 spin_lock_irqsave(&p->lock, flags);
339 e = __wb_force_remove_mapping(p, current_oblock);
340 e->oblock = oblock;
341 add_cache_entry(p, e);
342 spin_unlock_irqrestore(&p->lock, flags);
343}
344
345static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
346{
347 struct list_head *l;
348 struct wb_cache_entry *r;
349
350 if (list_empty(&p->dirty))
351 return NULL;
352
353 l = list_pop(&p->dirty);
354 r = container_of(l, struct wb_cache_entry, list);
355 list_add(l, &p->clean_pending);
356
357 return r;
358}
359
360static int wb_writeback_work(struct dm_cache_policy *pe,
361 dm_oblock_t *oblock,
362 dm_cblock_t *cblock,
363 bool critical_only)
364{
365 int r = -ENOENT;
366 struct policy *p = to_policy(pe);
367 struct wb_cache_entry *e;
368 unsigned long flags;
369
370 spin_lock_irqsave(&p->lock, flags);
371
372 e = get_next_dirty_entry(p);
373 if (e) {
374 *oblock = e->oblock;
375 *cblock = e->cblock;
376 r = 0;
377 }
378
379 spin_unlock_irqrestore(&p->lock, flags);
380
381 return r;
382}
383
384static dm_cblock_t wb_residency(struct dm_cache_policy *pe)
385{
386 return to_policy(pe)->nr_cblocks_allocated;
387}
388
389/* Init the policy plugin interface function pointers. */
390static void init_policy_functions(struct policy *p)
391{
392 p->policy.destroy = wb_destroy;
393 p->policy.map = wb_map;
394 p->policy.lookup = wb_lookup;
395 p->policy.set_dirty = wb_set_dirty;
396 p->policy.clear_dirty = wb_clear_dirty;
397 p->policy.load_mapping = wb_load_mapping;
398 p->policy.get_hint = NULL;
399 p->policy.remove_mapping = wb_remove_mapping;
400 p->policy.writeback_work = wb_writeback_work;
401 p->policy.force_mapping = wb_force_mapping;
402 p->policy.residency = wb_residency;
403 p->policy.tick = NULL;
404}
405
406static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
407 sector_t origin_size,
408 sector_t cache_block_size)
409{
410 int r;
411 struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL);
412
413 if (!p)
414 return NULL;
415
416 init_policy_functions(p);
417 INIT_LIST_HEAD(&p->free);
418 INIT_LIST_HEAD(&p->clean);
419 INIT_LIST_HEAD(&p->clean_pending);
420 INIT_LIST_HEAD(&p->dirty);
421
422 p->cache_size = cache_size;
423 spin_lock_init(&p->lock);
424
425 /* Allocate cache entry structs and add them to free list. */
426 r = alloc_cache_blocks_with_hash(p, cache_size);
427 if (!r)
428 return &p->policy;
429
430 kfree(p);
431
432 return NULL;
433}
434/*----------------------------------------------------------------------------*/
435
436static struct dm_cache_policy_type wb_policy_type = {
437 .name = "cleaner",
438 .version = {1, 0, 0},
439 .hint_size = 4,
440 .owner = THIS_MODULE,
441 .create = wb_create
442};
443
444static int __init wb_init(void)
445{
446 int r = dm_cache_policy_register(&wb_policy_type);
447
448 if (r < 0)
449 DMERR("register failed %d", r);
450 else
451 DMINFO("version %u.%u.%u loaded",
452 wb_policy_type.version[0],
453 wb_policy_type.version[1],
454 wb_policy_type.version[2]);
455
456 return r;
457}
458
459static void __exit wb_exit(void)
460{
461 dm_cache_policy_unregister(&wb_policy_type);
462}
463
464module_init(wb_init);
465module_exit(wb_exit);
466
467MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>");
468MODULE_LICENSE("GPL");
469MODULE_DESCRIPTION("cleaner cache policy");
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 808ee0e2b2c4..56f0a23f698c 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -12,70 +12,65 @@
12 12
13/*----------------------------------------------------------------*/ 13/*----------------------------------------------------------------*/
14 14
15/* 15static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
16 * Little inline functions that simplify calling the policy methods. 16 int data_dir, bool fast_copy, bool *background_queued)
17 */
18static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
19 bool can_block, bool can_migrate, bool discarded_oblock,
20 struct bio *bio, struct policy_locker *locker,
21 struct policy_result *result)
22{ 17{
23 return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result); 18 return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued);
24} 19}
25 20
26static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) 21static inline int policy_lookup_with_work(struct dm_cache_policy *p,
22 dm_oblock_t oblock, dm_cblock_t *cblock,
23 int data_dir, bool fast_copy,
24 struct policy_work **work)
27{ 25{
28 BUG_ON(!p->lookup); 26 if (!p->lookup_with_work) {
29 return p->lookup(p, oblock, cblock); 27 *work = NULL;
30} 28 return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL);
29 }
31 30
32static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 31 return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work);
33{
34 if (p->set_dirty)
35 p->set_dirty(p, oblock);
36} 32}
37 33
38static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 34static inline int policy_get_background_work(struct dm_cache_policy *p,
35 bool idle, struct policy_work **result)
39{ 36{
40 if (p->clear_dirty) 37 return p->get_background_work(p, idle, result);
41 p->clear_dirty(p, oblock);
42} 38}
43 39
44static inline int policy_load_mapping(struct dm_cache_policy *p, 40static inline void policy_complete_background_work(struct dm_cache_policy *p,
45 dm_oblock_t oblock, dm_cblock_t cblock, 41 struct policy_work *work,
46 uint32_t hint, bool hint_valid) 42 bool success)
47{ 43{
48 return p->load_mapping(p, oblock, cblock, hint, hint_valid); 44 return p->complete_background_work(p, work, success);
49} 45}
50 46
51static inline uint32_t policy_get_hint(struct dm_cache_policy *p, 47static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
52 dm_cblock_t cblock)
53{ 48{
54 return p->get_hint ? p->get_hint(p, cblock) : 0; 49 p->set_dirty(p, cblock);
55} 50}
56 51
57static inline int policy_writeback_work(struct dm_cache_policy *p, 52static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
58 dm_oblock_t *oblock,
59 dm_cblock_t *cblock,
60 bool critical_only)
61{ 53{
62 return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT; 54 p->clear_dirty(p, cblock);
63} 55}
64 56
65static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 57static inline int policy_load_mapping(struct dm_cache_policy *p,
58 dm_oblock_t oblock, dm_cblock_t cblock,
59 bool dirty, uint32_t hint, bool hint_valid)
66{ 60{
67 p->remove_mapping(p, oblock); 61 return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid);
68} 62}
69 63
70static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) 64static inline int policy_invalidate_mapping(struct dm_cache_policy *p,
65 dm_cblock_t cblock)
71{ 66{
72 return p->remove_cblock(p, cblock); 67 return p->invalidate_mapping(p, cblock);
73} 68}
74 69
75static inline void policy_force_mapping(struct dm_cache_policy *p, 70static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
76 dm_oblock_t current_oblock, dm_oblock_t new_oblock) 71 dm_cblock_t cblock)
77{ 72{
78 return p->force_mapping(p, current_oblock, new_oblock); 73 return p->get_hint ? p->get_hint(p, cblock) : 0;
79} 74}
80 75
81static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) 76static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
@@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
107 return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; 102 return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL;
108} 103}
109 104
105static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow)
106{
107 return p->allow_migrations(p, allow);
108}
109
110/*----------------------------------------------------------------*/ 110/*----------------------------------------------------------------*/
111 111
112/* 112/*
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index f19c6930a67c..74436dc2122f 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -4,8 +4,9 @@
4 * This file is released under the GPL. 4 * This file is released under the GPL.
5 */ 5 */
6 6
7#include "dm-cache-policy.h" 7#include "dm-cache-background-tracker.h"
8#include "dm-cache-policy-internal.h" 8#include "dm-cache-policy-internal.h"
9#include "dm-cache-policy.h"
9#include "dm.h" 10#include "dm.h"
10 11
11#include <linux/hash.h> 12#include <linux/hash.h>
@@ -38,10 +39,11 @@ struct entry {
38 unsigned hash_next:28; 39 unsigned hash_next:28;
39 unsigned prev:28; 40 unsigned prev:28;
40 unsigned next:28; 41 unsigned next:28;
41 unsigned level:7; 42 unsigned level:6;
42 bool dirty:1; 43 bool dirty:1;
43 bool allocated:1; 44 bool allocated:1;
44 bool sentinel:1; 45 bool sentinel:1;
46 bool pending_work:1;
45 47
46 dm_oblock_t oblock; 48 dm_oblock_t oblock;
47}; 49};
@@ -279,14 +281,28 @@ static unsigned q_size(struct queue *q)
279 */ 281 */
280static void q_push(struct queue *q, struct entry *e) 282static void q_push(struct queue *q, struct entry *e)
281{ 283{
284 BUG_ON(e->pending_work);
285
282 if (!e->sentinel) 286 if (!e->sentinel)
283 q->nr_elts++; 287 q->nr_elts++;
284 288
285 l_add_tail(q->es, q->qs + e->level, e); 289 l_add_tail(q->es, q->qs + e->level, e);
286} 290}
287 291
292static void q_push_front(struct queue *q, struct entry *e)
293{
294 BUG_ON(e->pending_work);
295
296 if (!e->sentinel)
297 q->nr_elts++;
298
299 l_add_head(q->es, q->qs + e->level, e);
300}
301
288static void q_push_before(struct queue *q, struct entry *old, struct entry *e) 302static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
289{ 303{
304 BUG_ON(e->pending_work);
305
290 if (!e->sentinel) 306 if (!e->sentinel)
291 q->nr_elts++; 307 q->nr_elts++;
292 308
@@ -336,19 +352,6 @@ static struct entry *q_pop(struct queue *q)
336} 352}
337 353
338/* 354/*
339 * Pops an entry from a level that is not past a sentinel.
340 */
341static struct entry *q_pop_old(struct queue *q, unsigned max_level)
342{
343 struct entry *e = q_peek(q, max_level, false);
344
345 if (e)
346 q_del(q, e);
347
348 return e;
349}
350
351/*
352 * This function assumes there is a non-sentinel entry to pop. It's only 355 * This function assumes there is a non-sentinel entry to pop. It's only
353 * used by redistribute, so we know this is true. It also doesn't adjust 356 * used by redistribute, so we know this is true. It also doesn't adjust
354 * the q->nr_elts count. 357 * the q->nr_elts count.
@@ -446,45 +449,49 @@ static void q_redistribute(struct queue *q)
446 break; 449 break;
447 450
448 e->level = level + 1u; 451 e->level = level + 1u;
449 l_add_head(q->es, l_above, e); 452 l_add_tail(q->es, l_above, e);
450 } 453 }
451 } 454 }
452} 455}
453 456
454static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels) 457static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
458 struct entry *s1, struct entry *s2)
455{ 459{
456 struct entry *de; 460 struct entry *de;
457 unsigned new_level; 461 unsigned sentinels_passed = 0;
458 462 unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
459 q_del(q, e);
460 463
464 /* try and find an entry to swap with */
461 if (extra_levels && (e->level < q->nr_levels - 1u)) { 465 if (extra_levels && (e->level < q->nr_levels - 1u)) {
462 new_level = min(q->nr_levels - 1u, e->level + extra_levels); 466 for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
463 for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) { 467 sentinels_passed++;
464 if (de->sentinel)
465 continue;
466 468
469 if (de) {
467 q_del(q, de); 470 q_del(q, de);
468 de->level = e->level; 471 de->level = e->level;
472 if (s1) {
473 switch (sentinels_passed) {
474 case 0:
475 q_push_before(q, s1, de);
476 break;
477
478 case 1:
479 q_push_before(q, s2, de);
480 break;
469 481
470 if (dest) 482 default:
471 q_push_before(q, dest, de); 483 q_push(q, de);
472 else 484 }
485 } else
473 q_push(q, de); 486 q_push(q, de);
474 break;
475 } 487 }
476
477 e->level = new_level;
478 } 488 }
479 489
490 q_del(q, e);
491 e->level = new_level;
480 q_push(q, e); 492 q_push(q, e);
481} 493}
482 494
483static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
484{
485 q_requeue_before(q, NULL, e, extra_levels);
486}
487
488/*----------------------------------------------------------------*/ 495/*----------------------------------------------------------------*/
489 496
490#define FP_SHIFT 8 497#define FP_SHIFT 8
@@ -550,7 +557,7 @@ static enum performance stats_assess(struct stats *s)
550 557
551/*----------------------------------------------------------------*/ 558/*----------------------------------------------------------------*/
552 559
553struct hash_table { 560struct smq_hash_table {
554 struct entry_space *es; 561 struct entry_space *es;
555 unsigned long long hash_bits; 562 unsigned long long hash_bits;
556 unsigned *buckets; 563 unsigned *buckets;
@@ -560,7 +567,7 @@ struct hash_table {
560 * All cache entries are stored in a chained hash table. To save space we 567 * All cache entries are stored in a chained hash table. To save space we
561 * use indexing again, and only store indexes to the next entry. 568 * use indexing again, and only store indexes to the next entry.
562 */ 569 */
563static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries) 570static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
564{ 571{
565 unsigned i, nr_buckets; 572 unsigned i, nr_buckets;
566 573
@@ -578,34 +585,34 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent
578 return 0; 585 return 0;
579} 586}
580 587
581static void h_exit(struct hash_table *ht) 588static void h_exit(struct smq_hash_table *ht)
582{ 589{
583 vfree(ht->buckets); 590 vfree(ht->buckets);
584} 591}
585 592
586static struct entry *h_head(struct hash_table *ht, unsigned bucket) 593static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
587{ 594{
588 return to_entry(ht->es, ht->buckets[bucket]); 595 return to_entry(ht->es, ht->buckets[bucket]);
589} 596}
590 597
591static struct entry *h_next(struct hash_table *ht, struct entry *e) 598static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
592{ 599{
593 return to_entry(ht->es, e->hash_next); 600 return to_entry(ht->es, e->hash_next);
594} 601}
595 602
596static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e) 603static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
597{ 604{
598 e->hash_next = ht->buckets[bucket]; 605 e->hash_next = ht->buckets[bucket];
599 ht->buckets[bucket] = to_index(ht->es, e); 606 ht->buckets[bucket] = to_index(ht->es, e);
600} 607}
601 608
602static void h_insert(struct hash_table *ht, struct entry *e) 609static void h_insert(struct smq_hash_table *ht, struct entry *e)
603{ 610{
604 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); 611 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
605 __h_insert(ht, h, e); 612 __h_insert(ht, h, e);
606} 613}
607 614
608static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock, 615static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
609 struct entry **prev) 616 struct entry **prev)
610{ 617{
611 struct entry *e; 618 struct entry *e;
@@ -621,7 +628,7 @@ static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t o
621 return NULL; 628 return NULL;
622} 629}
623 630
624static void __h_unlink(struct hash_table *ht, unsigned h, 631static void __h_unlink(struct smq_hash_table *ht, unsigned h,
625 struct entry *e, struct entry *prev) 632 struct entry *e, struct entry *prev)
626{ 633{
627 if (prev) 634 if (prev)
@@ -633,7 +640,7 @@ static void __h_unlink(struct hash_table *ht, unsigned h,
633/* 640/*
634 * Also moves each entry to the front of the bucket. 641 * Also moves each entry to the front of the bucket.
635 */ 642 */
636static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) 643static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
637{ 644{
638 struct entry *e, *prev; 645 struct entry *e, *prev;
639 unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); 646 unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
@@ -651,7 +658,7 @@ static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
651 return e; 658 return e;
652} 659}
653 660
654static void h_remove(struct hash_table *ht, struct entry *e) 661static void h_remove(struct smq_hash_table *ht, struct entry *e)
655{ 662{
656 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); 663 unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
657 struct entry *prev; 664 struct entry *prev;
@@ -699,7 +706,10 @@ static void init_entry(struct entry *e)
699 e->next = INDEXER_NULL; 706 e->next = INDEXER_NULL;
700 e->prev = INDEXER_NULL; 707 e->prev = INDEXER_NULL;
701 e->level = 0u; 708 e->level = 0u;
709 e->dirty = true; /* FIXME: audit */
702 e->allocated = true; 710 e->allocated = true;
711 e->sentinel = false;
712 e->pending_work = false;
703} 713}
704 714
705static struct entry *alloc_entry(struct entry_alloc *ea) 715static struct entry *alloc_entry(struct entry_alloc *ea)
@@ -762,11 +772,11 @@ static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
762#define NR_HOTSPOT_LEVELS 64u 772#define NR_HOTSPOT_LEVELS 64u
763#define NR_CACHE_LEVELS 64u 773#define NR_CACHE_LEVELS 64u
764 774
765#define WRITEBACK_PERIOD (10 * HZ) 775#define WRITEBACK_PERIOD (10ul * HZ)
766#define DEMOTE_PERIOD (60 * HZ) 776#define DEMOTE_PERIOD (60ul * HZ)
767 777
768#define HOTSPOT_UPDATE_PERIOD (HZ) 778#define HOTSPOT_UPDATE_PERIOD (HZ)
769#define CACHE_UPDATE_PERIOD (10u * HZ) 779#define CACHE_UPDATE_PERIOD (60ul * HZ)
770 780
771struct smq_policy { 781struct smq_policy {
772 struct dm_cache_policy policy; 782 struct dm_cache_policy policy;
@@ -814,8 +824,8 @@ struct smq_policy {
814 * The hash tables allows us to quickly find an entry by origin 824 * The hash tables allows us to quickly find an entry by origin
815 * block. 825 * block.
816 */ 826 */
817 struct hash_table table; 827 struct smq_hash_table table;
818 struct hash_table hotspot_table; 828 struct smq_hash_table hotspot_table;
819 829
820 bool current_writeback_sentinels; 830 bool current_writeback_sentinels;
821 unsigned long next_writeback_period; 831 unsigned long next_writeback_period;
@@ -828,6 +838,10 @@ struct smq_policy {
828 838
829 unsigned long next_hotspot_period; 839 unsigned long next_hotspot_period;
830 unsigned long next_cache_period; 840 unsigned long next_cache_period;
841
842 struct background_tracker *bg_work;
843
844 bool migrations_allowed;
831}; 845};
832 846
833/*----------------------------------------------------------------*/ 847/*----------------------------------------------------------------*/
@@ -876,15 +890,15 @@ static void __update_demote_sentinels(struct smq_policy *mq)
876static void update_sentinels(struct smq_policy *mq) 890static void update_sentinels(struct smq_policy *mq)
877{ 891{
878 if (time_after(jiffies, mq->next_writeback_period)) { 892 if (time_after(jiffies, mq->next_writeback_period)) {
879 __update_writeback_sentinels(mq);
880 mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; 893 mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
881 mq->current_writeback_sentinels = !mq->current_writeback_sentinels; 894 mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
895 __update_writeback_sentinels(mq);
882 } 896 }
883 897
884 if (time_after(jiffies, mq->next_demote_period)) { 898 if (time_after(jiffies, mq->next_demote_period)) {
885 __update_demote_sentinels(mq);
886 mq->next_demote_period = jiffies + DEMOTE_PERIOD; 899 mq->next_demote_period = jiffies + DEMOTE_PERIOD;
887 mq->current_demote_sentinels = !mq->current_demote_sentinels; 900 mq->current_demote_sentinels = !mq->current_demote_sentinels;
901 __update_demote_sentinels(mq);
888 } 902 }
889} 903}
890 904
@@ -920,55 +934,40 @@ static void sentinels_init(struct smq_policy *mq)
920 934
921/*----------------------------------------------------------------*/ 935/*----------------------------------------------------------------*/
922 936
923/* 937static void del_queue(struct smq_policy *mq, struct entry *e)
924 * These methods tie together the dirty queue, clean queue and hash table.
925 */
926static void push_new(struct smq_policy *mq, struct entry *e)
927{ 938{
928 struct queue *q = e->dirty ? &mq->dirty : &mq->clean; 939 q_del(e->dirty ? &mq->dirty : &mq->clean, e);
929 h_insert(&mq->table, e);
930 q_push(q, e);
931} 940}
932 941
933static void push(struct smq_policy *mq, struct entry *e) 942static void push_queue(struct smq_policy *mq, struct entry *e)
934{ 943{
935 struct entry *sentinel; 944 if (e->dirty)
936 945 q_push(&mq->dirty, e);
937 h_insert(&mq->table, e); 946 else
938 947 q_push(&mq->clean, e);
939 /*
940 * Punch this into the queue just in front of the sentinel, to
941 * ensure it's cleaned straight away.
942 */
943 if (e->dirty) {
944 sentinel = writeback_sentinel(mq, e->level);
945 q_push_before(&mq->dirty, sentinel, e);
946 } else {
947 sentinel = demote_sentinel(mq, e->level);
948 q_push_before(&mq->clean, sentinel, e);
949 }
950} 948}
951 949
952/* 950// !h, !q, a -> h, q, a
953 * Removes an entry from cache. Removes from the hash table. 951static void push(struct smq_policy *mq, struct entry *e)
954 */
955static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
956{ 952{
957 q_del(q, e); 953 h_insert(&mq->table, e);
958 h_remove(&mq->table, e); 954 if (!e->pending_work)
955 push_queue(mq, e);
959} 956}
960 957
961static void del(struct smq_policy *mq, struct entry *e) 958static void push_queue_front(struct smq_policy *mq, struct entry *e)
962{ 959{
963 __del(mq, e->dirty ? &mq->dirty : &mq->clean, e); 960 if (e->dirty)
961 q_push_front(&mq->dirty, e);
962 else
963 q_push_front(&mq->clean, e);
964} 964}
965 965
966static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level) 966static void push_front(struct smq_policy *mq, struct entry *e)
967{ 967{
968 struct entry *e = q_pop_old(q, max_level); 968 h_insert(&mq->table, e);
969 if (e) 969 if (!e->pending_work)
970 h_remove(&mq->table, e); 970 push_queue_front(mq, e);
971 return e;
972} 971}
973 972
974static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) 973static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
@@ -978,16 +977,21 @@ static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
978 977
979static void requeue(struct smq_policy *mq, struct entry *e) 978static void requeue(struct smq_policy *mq, struct entry *e)
980{ 979{
981 struct entry *sentinel; 980 /*
981 * Pending work has temporarily been taken out of the queues.
982 */
983 if (e->pending_work)
984 return;
982 985
983 if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { 986 if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
984 if (e->dirty) { 987 if (!e->dirty) {
985 sentinel = writeback_sentinel(mq, e->level); 988 q_requeue(&mq->clean, e, 1u, NULL, NULL);
986 q_requeue_before(&mq->dirty, sentinel, e, 1u); 989 return;
987 } else {
988 sentinel = demote_sentinel(mq, e->level);
989 q_requeue_before(&mq->clean, sentinel, e, 1u);
990 } 990 }
991
992 q_requeue(&mq->dirty, e, 1u,
993 get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
994 get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
991 } 995 }
992} 996}
993 997
@@ -1026,6 +1030,8 @@ static void update_promote_levels(struct smq_policy *mq)
1026 unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? 1030 unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
1027 default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); 1031 default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
1028 1032
1033 threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
1034
1029 /* 1035 /*
1030 * If the hotspot queue is performing badly then we have little 1036 * If the hotspot queue is performing badly then we have little
1031 * confidence that we know which blocks to promote. So we cut down 1037 * confidence that we know which blocks to promote. So we cut down
@@ -1045,7 +1051,7 @@ static void update_promote_levels(struct smq_policy *mq)
1045 } 1051 }
1046 1052
1047 mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; 1053 mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
1048 mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u; 1054 mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
1049} 1055}
1050 1056
1051/* 1057/*
@@ -1095,34 +1101,142 @@ static void end_cache_period(struct smq_policy *mq)
1095 } 1101 }
1096} 1102}
1097 1103
1098static int demote_cblock(struct smq_policy *mq, 1104/*----------------------------------------------------------------*/
1099 struct policy_locker *locker, 1105
1100 dm_oblock_t *oblock) 1106/*
1107 * Targets are given as a percentage.
1108 */
1109#define CLEAN_TARGET 25u
1110#define FREE_TARGET 25u
1111
1112static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
1101{ 1113{
1102 struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false); 1114 return from_cblock(mq->cache_size) * p / 100u;
1103 if (!demoted) 1115}
1104 /* 1116
1105 * We could get a block from mq->dirty, but that 1117static bool clean_target_met(struct smq_policy *mq, bool idle)
1106 * would add extra latency to the triggering bio as it 1118{
1107 * waits for the writeback. Better to not promote this 1119 /*
1108 * time and hope there's a clean block next time this block 1120 * Cache entries may not be populated. So we cannot rely on the
1109 * is hit. 1121 * size of the clean queue.
1110 */ 1122 */
1111 return -ENOSPC; 1123 unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
1112 1124
1113 if (locker->fn(locker, demoted->oblock)) 1125 if (idle)
1114 /* 1126 /*
1115 * We couldn't lock this block. 1127 * We'd like to clean everything.
1116 */ 1128 */
1117 return -EBUSY; 1129 return q_size(&mq->dirty) == 0u;
1130 else
1131 return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >=
1132 percent_to_target(mq, CLEAN_TARGET);
1133}
1118 1134
1119 del(mq, demoted); 1135static bool free_target_met(struct smq_policy *mq, bool idle)
1120 *oblock = demoted->oblock; 1136{
1121 free_entry(&mq->cache_alloc, demoted); 1137 unsigned nr_free = from_cblock(mq->cache_size) -
1138 mq->cache_alloc.nr_allocated;
1122 1139
1123 return 0; 1140 if (idle)
1141 return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
1142 percent_to_target(mq, FREE_TARGET);
1143 else
1144 return true;
1124} 1145}
1125 1146
1147/*----------------------------------------------------------------*/
1148
1149static void mark_pending(struct smq_policy *mq, struct entry *e)
1150{
1151 BUG_ON(e->sentinel);
1152 BUG_ON(!e->allocated);
1153 BUG_ON(e->pending_work);
1154 e->pending_work = true;
1155}
1156
1157static void clear_pending(struct smq_policy *mq, struct entry *e)
1158{
1159 BUG_ON(!e->pending_work);
1160 e->pending_work = false;
1161}
1162
1163static void queue_writeback(struct smq_policy *mq)
1164{
1165 int r;
1166 struct policy_work work;
1167 struct entry *e;
1168
1169 e = q_peek(&mq->dirty, mq->dirty.nr_levels, false);
1170 if (e) {
1171 mark_pending(mq, e);
1172 q_del(&mq->dirty, e);
1173
1174 work.op = POLICY_WRITEBACK;
1175 work.oblock = e->oblock;
1176 work.cblock = infer_cblock(mq, e);
1177
1178 r = btracker_queue(mq->bg_work, &work, NULL);
1179 WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race.
1180 }
1181}
1182
1183static void queue_demotion(struct smq_policy *mq)
1184{
1185 struct policy_work work;
1186 struct entry *e;
1187
1188 if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
1189 return;
1190
1191 e = q_peek(&mq->clean, mq->clean.nr_levels, true);
1192 if (!e) {
1193 if (!clean_target_met(mq, false))
1194 queue_writeback(mq);
1195 return;
1196 }
1197
1198 mark_pending(mq, e);
1199 q_del(&mq->clean, e);
1200
1201 work.op = POLICY_DEMOTE;
1202 work.oblock = e->oblock;
1203 work.cblock = infer_cblock(mq, e);
1204 btracker_queue(mq->bg_work, &work, NULL);
1205}
1206
1207static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
1208 struct policy_work **workp)
1209{
1210 struct entry *e;
1211 struct policy_work work;
1212
1213 if (!mq->migrations_allowed)
1214 return;
1215
1216 if (allocator_empty(&mq->cache_alloc)) {
1217 if (!free_target_met(mq, false))
1218 queue_demotion(mq);
1219 return;
1220 }
1221
1222 if (btracker_promotion_already_present(mq->bg_work, oblock))
1223 return;
1224
1225 /*
1226 * We allocate the entry now to reserve the cblock. If the
1227 * background work is aborted we must remember to free it.
1228 */
1229 e = alloc_entry(&mq->cache_alloc);
1230 BUG_ON(!e);
1231 e->pending_work = true;
1232 work.op = POLICY_PROMOTE;
1233 work.oblock = oblock;
1234 work.cblock = infer_cblock(mq, e);
1235 btracker_queue(mq->bg_work, &work, workp);
1236}
1237
1238/*----------------------------------------------------------------*/
1239
1126enum promote_result { 1240enum promote_result {
1127 PROMOTE_NOT, 1241 PROMOTE_NOT,
1128 PROMOTE_TEMPORARY, 1242 PROMOTE_TEMPORARY,
@@ -1137,49 +1251,18 @@ static enum promote_result maybe_promote(bool promote)
1137 return promote ? PROMOTE_PERMANENT : PROMOTE_NOT; 1251 return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
1138} 1252}
1139 1253
1140static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio, 1254static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
1141 bool fast_promote) 1255 int data_dir, bool fast_promote)
1142{ 1256{
1143 if (bio_data_dir(bio) == WRITE) { 1257 if (data_dir == WRITE) {
1144 if (!allocator_empty(&mq->cache_alloc) && fast_promote) 1258 if (!allocator_empty(&mq->cache_alloc) && fast_promote)
1145 return PROMOTE_TEMPORARY; 1259 return PROMOTE_TEMPORARY;
1146 1260
1147 else 1261 return maybe_promote(hs_e->level >= mq->write_promote_level);
1148 return maybe_promote(hs_e->level >= mq->write_promote_level);
1149 } else 1262 } else
1150 return maybe_promote(hs_e->level >= mq->read_promote_level); 1263 return maybe_promote(hs_e->level >= mq->read_promote_level);
1151} 1264}
1152 1265
1153static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
1154 struct policy_locker *locker,
1155 struct policy_result *result, enum promote_result pr)
1156{
1157 int r;
1158 struct entry *e;
1159
1160 if (allocator_empty(&mq->cache_alloc)) {
1161 result->op = POLICY_REPLACE;
1162 r = demote_cblock(mq, locker, &result->old_oblock);
1163 if (r) {
1164 result->op = POLICY_MISS;
1165 return;
1166 }
1167
1168 } else
1169 result->op = POLICY_NEW;
1170
1171 e = alloc_entry(&mq->cache_alloc);
1172 BUG_ON(!e);
1173 e->oblock = oblock;
1174
1175 if (pr == PROMOTE_TEMPORARY)
1176 push(mq, e);
1177 else
1178 push_new(mq, e);
1179
1180 result->cblock = infer_cblock(mq, e);
1181}
1182
1183static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) 1266static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1184{ 1267{
1185 sector_t r = from_oblock(b); 1268 sector_t r = from_oblock(b);
@@ -1187,7 +1270,7 @@ static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1187 return to_oblock(r); 1270 return to_oblock(r);
1188} 1271}
1189 1272
1190static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio) 1273static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
1191{ 1274{
1192 unsigned hi; 1275 unsigned hi;
1193 dm_oblock_t hb = to_hblock(mq, b); 1276 dm_oblock_t hb = to_hblock(mq, b);
@@ -1199,7 +1282,8 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b,
1199 hi = get_index(&mq->hotspot_alloc, e); 1282 hi = get_index(&mq->hotspot_alloc, e);
1200 q_requeue(&mq->hotspot, e, 1283 q_requeue(&mq->hotspot, e,
1201 test_and_set_bit(hi, mq->hotspot_hit_bits) ? 1284 test_and_set_bit(hi, mq->hotspot_hit_bits) ?
1202 0u : mq->hotspot_level_jump); 1285 0u : mq->hotspot_level_jump,
1286 NULL, NULL);
1203 1287
1204 } else { 1288 } else {
1205 stats_miss(&mq->hotspot_stats); 1289 stats_miss(&mq->hotspot_stats);
@@ -1225,47 +1309,6 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b,
1225 return e; 1309 return e;
1226} 1310}
1227 1311
1228/*
1229 * Looks the oblock up in the hash table, then decides whether to put in
1230 * pre_cache, or cache etc.
1231 */
1232static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
1233 bool can_migrate, bool fast_promote,
1234 struct policy_locker *locker, struct policy_result *result)
1235{
1236 struct entry *e, *hs_e;
1237 enum promote_result pr;
1238
1239 hs_e = update_hotspot_queue(mq, oblock, bio);
1240
1241 e = h_lookup(&mq->table, oblock);
1242 if (e) {
1243 stats_level_accessed(&mq->cache_stats, e->level);
1244
1245 requeue(mq, e);
1246 result->op = POLICY_HIT;
1247 result->cblock = infer_cblock(mq, e);
1248
1249 } else {
1250 stats_miss(&mq->cache_stats);
1251
1252 pr = should_promote(mq, hs_e, bio, fast_promote);
1253 if (pr == PROMOTE_NOT)
1254 result->op = POLICY_MISS;
1255
1256 else {
1257 if (!can_migrate) {
1258 result->op = POLICY_MISS;
1259 return -EWOULDBLOCK;
1260 }
1261
1262 insert_in_cache(mq, oblock, locker, result, pr);
1263 }
1264 }
1265
1266 return 0;
1267}
1268
1269/*----------------------------------------------------------------*/ 1312/*----------------------------------------------------------------*/
1270 1313
1271/* 1314/*
@@ -1282,6 +1325,7 @@ static void smq_destroy(struct dm_cache_policy *p)
1282{ 1325{
1283 struct smq_policy *mq = to_smq_policy(p); 1326 struct smq_policy *mq = to_smq_policy(p);
1284 1327
1328 btracker_destroy(mq->bg_work);
1285 h_exit(&mq->hotspot_table); 1329 h_exit(&mq->hotspot_table);
1286 h_exit(&mq->table); 1330 h_exit(&mq->table);
1287 free_bitset(mq->hotspot_hit_bits); 1331 free_bitset(mq->hotspot_hit_bits);
@@ -1290,234 +1334,247 @@ static void smq_destroy(struct dm_cache_policy *p)
1290 kfree(mq); 1334 kfree(mq);
1291} 1335}
1292 1336
1293static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, 1337/*----------------------------------------------------------------*/
1294 bool can_block, bool can_migrate, bool fast_promote,
1295 struct bio *bio, struct policy_locker *locker,
1296 struct policy_result *result)
1297{
1298 int r;
1299 unsigned long flags;
1300 struct smq_policy *mq = to_smq_policy(p);
1301
1302 result->op = POLICY_MISS;
1303
1304 spin_lock_irqsave(&mq->lock, flags);
1305 r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
1306 spin_unlock_irqrestore(&mq->lock, flags);
1307
1308 return r;
1309}
1310 1338
1311static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) 1339static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
1340 int data_dir, bool fast_copy,
1341 struct policy_work **work, bool *background_work)
1312{ 1342{
1313 int r; 1343 struct entry *e, *hs_e;
1314 unsigned long flags; 1344 enum promote_result pr;
1315 struct smq_policy *mq = to_smq_policy(p); 1345
1316 struct entry *e; 1346 *background_work = false;
1317 1347
1318 spin_lock_irqsave(&mq->lock, flags);
1319 e = h_lookup(&mq->table, oblock); 1348 e = h_lookup(&mq->table, oblock);
1320 if (e) { 1349 if (e) {
1350 stats_level_accessed(&mq->cache_stats, e->level);
1351
1352 requeue(mq, e);
1321 *cblock = infer_cblock(mq, e); 1353 *cblock = infer_cblock(mq, e);
1322 r = 0; 1354 return 0;
1323 } else
1324 r = -ENOENT;
1325 spin_unlock_irqrestore(&mq->lock, flags);
1326 1355
1327 return r; 1356 } else {
1328} 1357 stats_miss(&mq->cache_stats);
1329 1358
1330static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set) 1359 /*
1331{ 1360 * The hotspot queue only gets updated with misses.
1332 struct entry *e; 1361 */
1362 hs_e = update_hotspot_queue(mq, oblock);
1333 1363
1334 e = h_lookup(&mq->table, oblock); 1364 pr = should_promote(mq, hs_e, data_dir, fast_copy);
1335 BUG_ON(!e); 1365 if (pr != PROMOTE_NOT) {
1366 queue_promotion(mq, oblock, work);
1367 *background_work = true;
1368 }
1336 1369
1337 del(mq, e); 1370 return -ENOENT;
1338 e->dirty = set; 1371 }
1339 push(mq, e);
1340} 1372}
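
For orientation, a minimal caller-side sketch of the lookup contract implemented by __lookup() above: a hit fills *cblock and returns 0, a miss returns -ENOENT, and the background flag reports whether a promotion was quietly queued on the background tracker. The sketch calls straight through the ops table declared in dm-cache-policy.h further down; example_map_read() itself is hypothetical and not part of the patch.

/* --- illustrative sketch, not part of the patch --- */
#include "dm-cache-policy.h"

static void example_map_read(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	dm_cblock_t cblock;
	bool background_queued;
	int r;

	r = p->lookup(p, oblock, &cblock, READ, false, &background_queued);
	if (!r) {
		/* hit: remap the bio to cblock on the fast device */
	} else if (r == -ENOENT) {
		/*
		 * miss: remap to the origin; if background_queued is true a
		 * promotion is now waiting on the background tracker.
		 */
	}
}
/* --- end sketch --- */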
1341 1373
1342static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 1374static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
1375 int data_dir, bool fast_copy,
1376 bool *background_work)
1343{ 1377{
1378 int r;
1344 unsigned long flags; 1379 unsigned long flags;
1345 struct smq_policy *mq = to_smq_policy(p); 1380 struct smq_policy *mq = to_smq_policy(p);
1346 1381
1347 spin_lock_irqsave(&mq->lock, flags); 1382 spin_lock_irqsave(&mq->lock, flags);
1348 __smq_set_clear_dirty(mq, oblock, true); 1383 r = __lookup(mq, oblock, cblock,
1384 data_dir, fast_copy,
1385 NULL, background_work);
1349 spin_unlock_irqrestore(&mq->lock, flags); 1386 spin_unlock_irqrestore(&mq->lock, flags);
1387
1388 return r;
1350} 1389}
1351 1390
1352static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) 1391static int smq_lookup_with_work(struct dm_cache_policy *p,
1392 dm_oblock_t oblock, dm_cblock_t *cblock,
1393 int data_dir, bool fast_copy,
1394 struct policy_work **work)
1353{ 1395{
1354 struct smq_policy *mq = to_smq_policy(p); 1396 int r;
1397 bool background_queued;
1355 unsigned long flags; 1398 unsigned long flags;
1399 struct smq_policy *mq = to_smq_policy(p);
1356 1400
1357 spin_lock_irqsave(&mq->lock, flags); 1401 spin_lock_irqsave(&mq->lock, flags);
1358 __smq_set_clear_dirty(mq, oblock, false); 1402 r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
1359 spin_unlock_irqrestore(&mq->lock, flags); 1403 spin_unlock_irqrestore(&mq->lock, flags);
1360}
1361 1404
1362static unsigned random_level(dm_cblock_t cblock) 1405 return r;
1363{
1364 return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1365} 1406}
1366 1407
1367static int smq_load_mapping(struct dm_cache_policy *p, 1408static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
1368 dm_oblock_t oblock, dm_cblock_t cblock, 1409 struct policy_work **result)
1369 uint32_t hint, bool hint_valid)
1370{ 1410{
1411 int r;
1412 unsigned long flags;
1371 struct smq_policy *mq = to_smq_policy(p); 1413 struct smq_policy *mq = to_smq_policy(p);
1372 struct entry *e;
1373 1414
1374 e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); 1415 spin_lock_irqsave(&mq->lock, flags);
1375 e->oblock = oblock; 1416 r = btracker_issue(mq->bg_work, result);
1376 e->dirty = false; /* this gets corrected in a minute */ 1417 if (r == -ENODATA) {
1377 e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); 1418 /* find some writeback work to do */
1378 push(mq, e); 1419 if (mq->migrations_allowed && !free_target_met(mq, idle))
1379 1420 queue_demotion(mq);
1380 return 0;
1381}
1382 1421
1383static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) 1422 else if (!clean_target_met(mq, idle))
1384{ 1423 queue_writeback(mq);
1385 struct smq_policy *mq = to_smq_policy(p);
1386 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1387 1424
1388 if (!e->allocated) 1425 r = btracker_issue(mq->bg_work, result);
1389 return 0; 1426 }
1427 spin_unlock_irqrestore(&mq->lock, flags);
1390 1428
1391 return e->level; 1429 return r;
1392} 1430}
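
smq_get_background_work() first drains the background tracker; only when that returns -ENODATA does it try to generate new work, queueing a demotion if migrations are allowed and the free-space target is not met, otherwise a writeback if the clean target is not met. Below is a hedged sketch of the consuming side; example_start_migration() is a hypothetical placeholder for whatever performs the copy, and the loop shape is an assumption rather than code from this patch.

/* --- illustrative sketch, not part of the patch --- */
extern void example_start_migration(struct policy_work *op);	/* hypothetical */

static void example_issue_background_work(struct dm_cache_policy *p, bool idle)
{
	struct policy_work *op;
	int r;

	for (;;) {
		r = p->get_background_work(p, idle, &op);
		if (r == -ENODATA)
			break;			/* nothing to do right now */
		if (r)
			break;			/* unexpected error */

		/*
		 * op->op is POLICY_PROMOTE, POLICY_DEMOTE or POLICY_WRITEBACK;
		 * op->oblock / op->cblock identify the blocks involved.
		 */
		example_start_migration(op);
	}
}
/* --- end sketch --- */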
1393 1431
1394static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) 1432/*
1395{ 1433 * We need to clear any pending work flags that have been set, and in the
1396 struct entry *e; 1434 * case of promotion free the entry for the destination cblock.
1435 */
1436static void __complete_background_work(struct smq_policy *mq,
1437 struct policy_work *work,
1438 bool success)
1439{
1440 struct entry *e = get_entry(&mq->cache_alloc,
1441 from_cblock(work->cblock));
1442
1443 switch (work->op) {
1444 case POLICY_PROMOTE:
1445 // !h, !q, a
1446 clear_pending(mq, e);
1447 if (success) {
1448 e->oblock = work->oblock;
1449 push(mq, e);
1450 // h, q, a
1451 } else {
1452 free_entry(&mq->cache_alloc, e);
1453 // !h, !q, !a
1454 }
1455 break;
1397 1456
1398 e = h_lookup(&mq->table, oblock); 1457 case POLICY_DEMOTE:
1399 BUG_ON(!e); 1458 // h, !q, a
1459 if (success) {
1460 h_remove(&mq->table, e);
1461 free_entry(&mq->cache_alloc, e);
1462 // !h, !q, !a
1463 } else {
1464 clear_pending(mq, e);
1465 push_queue(mq, e);
1466 // h, q, a
1467 }
1468 break;
1400 1469
1401 del(mq, e); 1470 case POLICY_WRITEBACK:
1402 free_entry(&mq->cache_alloc, e); 1471 // h, !q, a
1472 clear_pending(mq, e);
1473 push_queue(mq, e);
1474 // h, q, a
1475 break;
1476 }
1477
1478 btracker_complete(mq->bg_work, work);
1403} 1479}
1404 1480
1405static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 1481static void smq_complete_background_work(struct dm_cache_policy *p,
1482 struct policy_work *work,
1483 bool success)
1406{ 1484{
1407 struct smq_policy *mq = to_smq_policy(p);
1408 unsigned long flags; 1485 unsigned long flags;
1486 struct smq_policy *mq = to_smq_policy(p);
1409 1487
1410 spin_lock_irqsave(&mq->lock, flags); 1488 spin_lock_irqsave(&mq->lock, flags);
1411 __remove_mapping(mq, oblock); 1489 __complete_background_work(mq, work, success);
1412 spin_unlock_irqrestore(&mq->lock, flags); 1490 spin_unlock_irqrestore(&mq->lock, flags);
1413} 1491}
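
The // h, q, a annotations above track whether an entry is hashed, queued and allocated before and after each transition. From the core target's side the contract is simply: once the copy or writeback finishes, hand back the exact policy_work pointer that was issued, flagging success or failure. A minimal, hypothetical sketch (struct example_migration is not from this patch, only the final call mirrors the interface):

/* --- illustrative sketch, not part of the patch --- */
struct example_migration {
	struct dm_cache_policy *policy;
	struct policy_work *op;
};

static void example_copy_complete(struct example_migration *mg, int copy_err)
{
	/*
	 * success == false after a failed promotion frees the destination
	 * cblock; after a failed demotion or a writeback the entry is simply
	 * requeued, as handled in __complete_background_work() above.
	 */
	mg->policy->complete_background_work(mg->policy, mg->op, !copy_err);
}
/* --- end sketch --- */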
1414 1492
1415static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) 1493// in_hash(oblock) -> in_hash(oblock)
1494static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
1416{ 1495{
1417 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); 1496 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1418 1497
1419 if (!e || !e->allocated) 1498 if (e->pending_work)
1420 return -ENODATA; 1499 e->dirty = set;
1421 1500 else {
1422 del(mq, e); 1501 del_queue(mq, e);
1423 free_entry(&mq->cache_alloc, e); 1502 e->dirty = set;
1424 1503 push_queue(mq, e);
1425 return 0; 1504 }
1426} 1505}
1427 1506
1428static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) 1507static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1429{ 1508{
1430 int r;
1431 unsigned long flags; 1509 unsigned long flags;
1432 struct smq_policy *mq = to_smq_policy(p); 1510 struct smq_policy *mq = to_smq_policy(p);
1433 1511
1434 spin_lock_irqsave(&mq->lock, flags); 1512 spin_lock_irqsave(&mq->lock, flags);
1435 r = __remove_cblock(mq, cblock); 1513 __smq_set_clear_dirty(mq, cblock, true);
1436 spin_unlock_irqrestore(&mq->lock, flags); 1514 spin_unlock_irqrestore(&mq->lock, flags);
1437
1438 return r;
1439} 1515}
1440 1516
1441 1517static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1442#define CLEAN_TARGET_CRITICAL 5u /* percent */
1443
1444static bool clean_target_met(struct smq_policy *mq, bool critical)
1445{ 1518{
1446 if (critical) { 1519 struct smq_policy *mq = to_smq_policy(p);
1447 /* 1520 unsigned long flags;
1448 * Cache entries may not be populated. So we cannot rely on the
1449 * size of the clean queue.
1450 */
1451 unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
1452 unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
1453 1521
1454 return nr_clean >= target; 1522 spin_lock_irqsave(&mq->lock, flags);
1455 } else 1523 __smq_set_clear_dirty(mq, cblock, false);
1456 return !q_size(&mq->dirty); 1524 spin_unlock_irqrestore(&mq->lock, flags);
1457} 1525}
1458 1526
1459static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock, 1527static unsigned random_level(dm_cblock_t cblock)
1460 dm_cblock_t *cblock, bool critical_only)
1461{ 1528{
1462 struct entry *e = NULL; 1529 return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1463 bool target_met = clean_target_met(mq, critical_only); 1530}
1464
1465 if (critical_only)
1466 /*
1467 * Always try and keep the bottom level clean.
1468 */
1469 e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
1470 1531
1471 else 1532static int smq_load_mapping(struct dm_cache_policy *p,
1472 e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels); 1533 dm_oblock_t oblock, dm_cblock_t cblock,
1534 bool dirty, uint32_t hint, bool hint_valid)
1535{
1536 struct smq_policy *mq = to_smq_policy(p);
1537 struct entry *e;
1473 1538
1474 if (!e) 1539 e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
1475 return -ENODATA; 1540 e->oblock = oblock;
1541 e->dirty = dirty;
1542 e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
1543 e->pending_work = false;
1476 1544
1477 *oblock = e->oblock; 1545 /*
1478 *cblock = infer_cblock(mq, e); 1546 * When we load mappings we push ahead of both sentinels in order to
1479 e->dirty = false; 1547 * allow demotions and cleaning to occur immediately.
1480 push_new(mq, e); 1548 */
1549 push_front(mq, e);
1481 1550
1482 return 0; 1551 return 0;
1483} 1552}
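
load_mapping() now receives the dirty bit straight from the metadata instead of defaulting it and correcting it afterwards (the old "this gets corrected in a minute" dance). A sketch of a metadata-walk callback feeding it follows; the callback signature is an assumption for illustration, only the load_mapping() call mirrors the new interface.

/* --- illustrative sketch, not part of the patch --- */
static int example_load_mapping(void *context, dm_oblock_t oblock,
				dm_cblock_t cblock, bool dirty,
				uint32_t hint, bool hint_valid)
{
	struct dm_cache_policy *p = context;

	/* the dirty bit flows straight into the policy entry */
	return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid);
}
/* --- end sketch --- */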
1484 1553
1485static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, 1554static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
1486 dm_cblock_t *cblock, bool critical_only)
1487{ 1555{
1488 int r;
1489 unsigned long flags;
1490 struct smq_policy *mq = to_smq_policy(p); 1556 struct smq_policy *mq = to_smq_policy(p);
1557 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1491 1558
1492 spin_lock_irqsave(&mq->lock, flags); 1559 if (!e->allocated)
1493 r = __smq_writeback_work(mq, oblock, cblock, critical_only); 1560 return -ENODATA;
1494 spin_unlock_irqrestore(&mq->lock, flags);
1495
1496 return r;
1497}
1498
1499static void __force_mapping(struct smq_policy *mq,
1500 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1501{
1502 struct entry *e = h_lookup(&mq->table, current_oblock);
1503 1561
1504 if (e) { 1562 // FIXME: what if this block has pending background work?
1505 del(mq, e); 1563 del_queue(mq, e);
1506 e->oblock = new_oblock; 1564 h_remove(&mq->table, e);
1507 e->dirty = true; 1565 free_entry(&mq->cache_alloc, e);
1508 push(mq, e); 1566 return 0;
1509 }
1510} 1567}
1511 1568
1512static void smq_force_mapping(struct dm_cache_policy *p, 1569static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
1513 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1514{ 1570{
1515 unsigned long flags;
1516 struct smq_policy *mq = to_smq_policy(p); 1571 struct smq_policy *mq = to_smq_policy(p);
1572 struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1517 1573
1518 spin_lock_irqsave(&mq->lock, flags); 1574 if (!e->allocated)
1519 __force_mapping(mq, current_oblock, new_oblock); 1575 return 0;
1520 spin_unlock_irqrestore(&mq->lock, flags); 1576
1577 return e->level;
1521} 1578}
1522 1579
1523static dm_cblock_t smq_residency(struct dm_cache_policy *p) 1580static dm_cblock_t smq_residency(struct dm_cache_policy *p)
@@ -1546,6 +1603,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block)
1546 spin_unlock_irqrestore(&mq->lock, flags); 1603 spin_unlock_irqrestore(&mq->lock, flags);
1547} 1604}
1548 1605
1606static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
1607{
1608 struct smq_policy *mq = to_smq_policy(p);
1609 mq->migrations_allowed = allow;
1610}
1611
1549/* 1612/*
1550 * smq has no config values, but the old mq policy did. To avoid breaking 1613 * smq has no config values, but the old mq policy did. To avoid breaking
1551 * software we continue to accept these configurables for the mq policy, 1614 * software we continue to accept these configurables for the mq policy,
@@ -1590,18 +1653,18 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
1590static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) 1653static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
1591{ 1654{
1592 mq->policy.destroy = smq_destroy; 1655 mq->policy.destroy = smq_destroy;
1593 mq->policy.map = smq_map;
1594 mq->policy.lookup = smq_lookup; 1656 mq->policy.lookup = smq_lookup;
1657 mq->policy.lookup_with_work = smq_lookup_with_work;
1658 mq->policy.get_background_work = smq_get_background_work;
1659 mq->policy.complete_background_work = smq_complete_background_work;
1595 mq->policy.set_dirty = smq_set_dirty; 1660 mq->policy.set_dirty = smq_set_dirty;
1596 mq->policy.clear_dirty = smq_clear_dirty; 1661 mq->policy.clear_dirty = smq_clear_dirty;
1597 mq->policy.load_mapping = smq_load_mapping; 1662 mq->policy.load_mapping = smq_load_mapping;
1663 mq->policy.invalidate_mapping = smq_invalidate_mapping;
1598 mq->policy.get_hint = smq_get_hint; 1664 mq->policy.get_hint = smq_get_hint;
1599 mq->policy.remove_mapping = smq_remove_mapping;
1600 mq->policy.remove_cblock = smq_remove_cblock;
1601 mq->policy.writeback_work = smq_writeback_work;
1602 mq->policy.force_mapping = smq_force_mapping;
1603 mq->policy.residency = smq_residency; 1665 mq->policy.residency = smq_residency;
1604 mq->policy.tick = smq_tick; 1666 mq->policy.tick = smq_tick;
1667 mq->policy.allow_migrations = smq_allow_migrations;
1605 1668
1606 if (mimic_mq) { 1669 if (mimic_mq) {
1607 mq->policy.set_config_value = mq_set_config_value; 1670 mq->policy.set_config_value = mq_set_config_value;
@@ -1633,7 +1696,8 @@ static void calc_hotspot_params(sector_t origin_size,
1633static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, 1696static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1634 sector_t origin_size, 1697 sector_t origin_size,
1635 sector_t cache_block_size, 1698 sector_t cache_block_size,
1636 bool mimic_mq) 1699 bool mimic_mq,
1700 bool migrations_allowed)
1637{ 1701{
1638 unsigned i; 1702 unsigned i;
1639 unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; 1703 unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
@@ -1658,11 +1722,11 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1658 } 1722 }
1659 1723
1660 init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); 1724 init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
1661 for (i = 0; i < nr_sentinels_per_queue; i++) 1725 for (i = 0; i < nr_sentinels_per_queue; i++)
1662 get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; 1726 get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
1663 1727
1664 init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); 1728 init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
1665 for (i = 0; i < nr_sentinels_per_queue; i++) 1729 for (i = 0; i < nr_sentinels_per_queue; i++)
1666 get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; 1730 get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
1667 1731
1668 init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels, 1732 init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
@@ -1715,8 +1779,16 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1715 mq->next_hotspot_period = jiffies; 1779 mq->next_hotspot_period = jiffies;
1716 mq->next_cache_period = jiffies; 1780 mq->next_cache_period = jiffies;
1717 1781
1782 mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */
1783 if (!mq->bg_work)
1784 goto bad_btracker;
1785
1786 mq->migrations_allowed = migrations_allowed;
1787
1718 return &mq->policy; 1788 return &mq->policy;
1719 1789
1790bad_btracker:
1791 h_exit(&mq->hotspot_table);
1720bad_alloc_hotspot_table: 1792bad_alloc_hotspot_table:
1721 h_exit(&mq->table); 1793 h_exit(&mq->table);
1722bad_alloc_table: 1794bad_alloc_table:
@@ -1735,21 +1807,28 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
1735 sector_t origin_size, 1807 sector_t origin_size,
1736 sector_t cache_block_size) 1808 sector_t cache_block_size)
1737{ 1809{
1738 return __smq_create(cache_size, origin_size, cache_block_size, false); 1810 return __smq_create(cache_size, origin_size, cache_block_size, false, true);
1739} 1811}
1740 1812
1741static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, 1813static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1742 sector_t origin_size, 1814 sector_t origin_size,
1743 sector_t cache_block_size) 1815 sector_t cache_block_size)
1744{ 1816{
1745 return __smq_create(cache_size, origin_size, cache_block_size, true); 1817 return __smq_create(cache_size, origin_size, cache_block_size, true, true);
1818}
1819
1820static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
1821 sector_t origin_size,
1822 sector_t cache_block_size)
1823{
1824 return __smq_create(cache_size, origin_size, cache_block_size, false, false);
1746} 1825}
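
With the standalone cleaner module gone, "cleaner" is simply smq created with migrations_allowed set to false, and the same knob is reachable at runtime through the allow_migrations() method registered above. A hypothetical helper showing that runtime side (example_pause_migrations() is not from this patch):

/* --- illustrative sketch, not part of the patch --- */
static void example_pause_migrations(struct dm_cache_policy *p, bool pause)
{
	if (p->allow_migrations)
		p->allow_migrations(p, !pause);
}
/* --- end sketch --- */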
1747 1826
1748/*----------------------------------------------------------------*/ 1827/*----------------------------------------------------------------*/
1749 1828
1750static struct dm_cache_policy_type smq_policy_type = { 1829static struct dm_cache_policy_type smq_policy_type = {
1751 .name = "smq", 1830 .name = "smq",
1752 .version = {1, 5, 0}, 1831 .version = {2, 0, 0},
1753 .hint_size = 4, 1832 .hint_size = 4,
1754 .owner = THIS_MODULE, 1833 .owner = THIS_MODULE,
1755 .create = smq_create 1834 .create = smq_create
@@ -1757,15 +1836,23 @@ static struct dm_cache_policy_type smq_policy_type = {
1757 1836
1758static struct dm_cache_policy_type mq_policy_type = { 1837static struct dm_cache_policy_type mq_policy_type = {
1759 .name = "mq", 1838 .name = "mq",
1760 .version = {1, 5, 0}, 1839 .version = {2, 0, 0},
1761 .hint_size = 4, 1840 .hint_size = 4,
1762 .owner = THIS_MODULE, 1841 .owner = THIS_MODULE,
1763 .create = mq_create, 1842 .create = mq_create,
1764}; 1843};
1765 1844
1845static struct dm_cache_policy_type cleaner_policy_type = {
1846 .name = "cleaner",
1847 .version = {2, 0, 0},
1848 .hint_size = 4,
1849 .owner = THIS_MODULE,
1850 .create = cleaner_create,
1851};
1852
1766static struct dm_cache_policy_type default_policy_type = { 1853static struct dm_cache_policy_type default_policy_type = {
1767 .name = "default", 1854 .name = "default",
1768 .version = {1, 5, 0}, 1855 .version = {2, 0, 0},
1769 .hint_size = 4, 1856 .hint_size = 4,
1770 .owner = THIS_MODULE, 1857 .owner = THIS_MODULE,
1771 .create = smq_create, 1858 .create = smq_create,
@@ -1785,23 +1872,36 @@ static int __init smq_init(void)
1785 r = dm_cache_policy_register(&mq_policy_type); 1872 r = dm_cache_policy_register(&mq_policy_type);
1786 if (r) { 1873 if (r) {
1787 DMERR("register failed (as mq) %d", r); 1874 DMERR("register failed (as mq) %d", r);
1788 dm_cache_policy_unregister(&smq_policy_type); 1875 goto out_mq;
1789 return -ENOMEM; 1876 }
1877
1878 r = dm_cache_policy_register(&cleaner_policy_type);
1879 if (r) {
1880 DMERR("register failed (as cleaner) %d", r);
1881 goto out_cleaner;
1790 } 1882 }
1791 1883
1792 r = dm_cache_policy_register(&default_policy_type); 1884 r = dm_cache_policy_register(&default_policy_type);
1793 if (r) { 1885 if (r) {
1794 DMERR("register failed (as default) %d", r); 1886 DMERR("register failed (as default) %d", r);
1795 dm_cache_policy_unregister(&mq_policy_type); 1887 goto out_default;
1796 dm_cache_policy_unregister(&smq_policy_type);
1797 return -ENOMEM;
1798 } 1888 }
1799 1889
1800 return 0; 1890 return 0;
1891
1892out_default:
1893 dm_cache_policy_unregister(&cleaner_policy_type);
1894out_cleaner:
1895 dm_cache_policy_unregister(&mq_policy_type);
1896out_mq:
1897 dm_cache_policy_unregister(&smq_policy_type);
1898
1899 return -ENOMEM;
1801} 1900}
1802 1901
1803static void __exit smq_exit(void) 1902static void __exit smq_exit(void)
1804{ 1903{
1904 dm_cache_policy_unregister(&cleaner_policy_type);
1805 dm_cache_policy_unregister(&smq_policy_type); 1905 dm_cache_policy_unregister(&smq_policy_type);
1806 dm_cache_policy_unregister(&mq_policy_type); 1906 dm_cache_policy_unregister(&mq_policy_type);
1807 dm_cache_policy_unregister(&default_policy_type); 1907 dm_cache_policy_unregister(&default_policy_type);
@@ -1816,3 +1916,4 @@ MODULE_DESCRIPTION("smq cache policy");
1816 1916
1817MODULE_ALIAS("dm-cache-default"); 1917MODULE_ALIAS("dm-cache-default");
1818MODULE_ALIAS("dm-cache-mq"); 1918MODULE_ALIAS("dm-cache-mq");
1919MODULE_ALIAS("dm-cache-cleaner");
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index aa10b1493f34..c05fc3436cef 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -13,183 +13,100 @@
13 13
14/*----------------------------------------------------------------*/ 14/*----------------------------------------------------------------*/
15 15
16/* FIXME: make it clear which methods are optional. Get debug policy to
17 * double check this at start.
18 */
19
20/* 16/*
21 * The cache policy makes the important decisions about which blocks get to 17 * The cache policy makes the important decisions about which blocks get to
22 * live on the faster cache device. 18 * live on the faster cache device.
23 *
24 * When the core target has to remap a bio it calls the 'map' method of the
25 * policy. This returns an instruction telling the core target what to do.
26 *
27 * POLICY_HIT:
28 * That block is in the cache. Remap to the cache and carry on.
29 *
30 * POLICY_MISS:
31 * This block is on the origin device. Remap and carry on.
32 *
33 * POLICY_NEW:
34 * This block is currently on the origin device, but the policy wants to
35 * move it. The core should:
36 *
37 * - hold any further io to this origin block
38 * - copy the origin to the given cache block
39 * - release all the held blocks
40 * - remap the original block to the cache
41 *
42 * POLICY_REPLACE:
43 * This block is currently on the origin device. The policy wants to
44 * move it to the cache, with the added complication that the destination
45 * cache block needs a writeback first. The core should:
46 *
47 * - hold any further io to this origin block
48 * - hold any further io to the origin block that's being written back
49 * - writeback
50 * - copy new block to cache
51 * - release held blocks
52 * - remap bio to cache and reissue.
53 *
54 * Should the core run into trouble while processing a POLICY_NEW or
55 * POLICY_REPLACE instruction it will roll back the policy's mapping using
56 * remove_mapping() or force_mapping(). These methods must not fail. This
57 * approach avoids having transactional semantics in the policy (ie, the
58 * core informing the policy when a migration is complete), and hence makes
59 * it easier to write new policies.
60 *
61 * In general policy methods should never block, except in the case of the
62 * map function when can_migrate is set. So be careful to implement using
63 * bounded, preallocated memory.
64 */ 19 */
65enum policy_operation { 20enum policy_operation {
66 POLICY_HIT, 21 POLICY_PROMOTE,
67 POLICY_MISS, 22 POLICY_DEMOTE,
68 POLICY_NEW, 23 POLICY_WRITEBACK
69 POLICY_REPLACE
70};
71
72/*
73 * When issuing a POLICY_REPLACE the policy needs to make a callback to
74 * lock the block being demoted. This doesn't need to occur during a
75 * writeback operation since the block remains in the cache.
76 */
77struct policy_locker;
78typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
79
80struct policy_locker {
81 policy_lock_fn fn;
82}; 24};
83 25
84/* 26/*
85 * This is the instruction passed back to the core target. 27 * This is the instruction passed back to the core target.
86 */ 28 */
87struct policy_result { 29struct policy_work {
88 enum policy_operation op; 30 enum policy_operation op;
89 dm_oblock_t old_oblock; /* POLICY_REPLACE */ 31 dm_oblock_t oblock;
90 dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ 32 dm_cblock_t cblock;
91}; 33};
92 34
93/* 35/*
94 * The cache policy object. Just a bunch of methods. It is envisaged that 36 * The cache policy object. It is envisaged that this structure will be
95 * this structure will be embedded in a bigger, policy specific structure 37 * embedded in a bigger, policy specific structure (ie. use container_of()).
96 * (ie. use container_of()).
97 */ 38 */
98struct dm_cache_policy { 39struct dm_cache_policy {
99
100 /*
101 * FIXME: make it clear which methods are optional, and which may
102 * block.
103 */
104
105 /* 40 /*
106 * Destroys this object. 41 * Destroys this object.
107 */ 42 */
108 void (*destroy)(struct dm_cache_policy *p); 43 void (*destroy)(struct dm_cache_policy *p);
109 44
110 /* 45 /*
111 * See large comment above. 46 * Find the location of a block.
112 *
113 * oblock - the origin block we're interested in.
114 *
115 * can_block - indicates whether the current thread is allowed to
116 * block. -EWOULDBLOCK returned if it can't and would.
117 *
118 * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE
119 * instructions. If denied and the policy would have
120 * returned one of these instructions it should
121 * return -EWOULDBLOCK.
122 * 47 *
123 * discarded_oblock - indicates whether the whole origin block is 48 * Must not block.
124 * in a discarded state (FIXME: better to tell the
125 * policy about this sooner, so it can recycle that
126 * cache block if it wants.)
127 * bio - the bio that triggered this call.
128 * result - gets filled in with the instruction.
129 * 49 *
130 * May only return 0, or -EWOULDBLOCK (if !can_migrate) 50 * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for
51 * other errors (-EWOULDBLOCK would be typical). data_dir should be
52 * READ or WRITE. fast_copy should be set if migrating this block would
53 * be 'cheap' somehow (eg, discarded data). background_queued will be set
54 * if a migration has just been queued.
131 */ 55 */
132 int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, 56 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
133 bool can_block, bool can_migrate, bool discarded_oblock, 57 int data_dir, bool fast_copy, bool *background_queued);
134 struct bio *bio, struct policy_locker *locker,
135 struct policy_result *result);
136 58
137 /* 59 /*
138 * Sometimes we want to see if a block is in the cache, without 60 * Sometimes the core target can optimise a migration, eg, the
139 * triggering any update of stats. (ie. it's not a real hit). 61 * block may be discarded, or the bio may cover an entire block.
140 * 62 * In order to optimise it needs the migration immediately though
141 * Must not block. 63 * so it knows to do something different with the bio.
142 * 64 *
143 * Returns 0 if in cache, -ENOENT if not, < 0 for other errors 65 * This method is optional (policy-internal will fallback to using
144 * (-EWOULDBLOCK would be typical). 66 * lookup).
145 */ 67 */
146 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); 68 int (*lookup_with_work)(struct dm_cache_policy *p,
147 69 dm_oblock_t oblock, dm_cblock_t *cblock,
148 void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 70 int data_dir, bool fast_copy,
149 void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 71 struct policy_work **work);
150 72
151 /* 73 /*
152 * Called when a cache target is first created. Used to load a 74 * Retrieves background work. Returns -ENODATA when there's no
153 * mapping from the metadata device into the policy. 75 * background work.
154 */ 76 */
155 int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, 77 int (*get_background_work)(struct dm_cache_policy *p, bool idle,
156 dm_cblock_t cblock, uint32_t hint, bool hint_valid); 78 struct policy_work **result);
157 79
158 /* 80 /*
159 * Gets the hint for a given cblock. Called in a single threaded 81 * You must pass in the same work pointer that you were given, not
160 * context. So no locking required. 82 * a copy.
161 */ 83 */
162 uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); 84 void (*complete_background_work)(struct dm_cache_policy *p,
85 struct policy_work *work,
86 bool success);
87
88 void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
89 void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock);
163 90
164 /* 91 /*
165 * Override functions used on the error paths of the core target. 92 * Called when a cache target is first created. Used to load a
166 * They must succeed. 93 * mapping from the metadata device into the policy.
167 */ 94 */
168 void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); 95 int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
169 void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, 96 dm_cblock_t cblock, bool dirty,
170 dm_oblock_t new_oblock); 97 uint32_t hint, bool hint_valid);
171 98
172 /* 99 /*
173 * This is called via the invalidate_cblocks message. It is 100 * Drops the mapping, irrespective of whether it's clean or dirty.
174 * possible the particular cblock has already been removed due to a 101 * Returns -ENODATA if cblock is not mapped.
175 * write io in passthrough mode. In which case this should return
176 * -ENODATA.
177 */ 102 */
178 int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); 103 int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock);
179 104
180 /* 105 /*
181 * Provide a dirty block to be written back by the core target. If 106 * Gets the hint for a given cblock. Called in a single threaded
182 * critical_only is set then the policy should only provide work if 107 * context. So no locking required.
183 * it urgently needs it.
184 *
185 * Returns:
186 *
187 * 0 and @cblock,@oblock: block to write back provided
188 *
189 * -ENODATA: no dirty blocks available
190 */ 108 */
191 int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, 109 uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock);
192 bool critical_only);
193 110
194 /* 111 /*
195 * How full is the cache? 112 * How full is the cache?
@@ -202,6 +119,8 @@ struct dm_cache_policy {
202 * queue merging has occurred). To stop the policy being fooled by 119 * queue merging has occurred). To stop the policy being fooled by
203 * these, the core target sends regular tick() calls to the policy. 120 * these, the core target sends regular tick() calls to the policy.
204 * The policy should only count an entry as hit once per tick. 121 * The policy should only count an entry as hit once per tick.
122 *
123 * This method is optional.
205 */ 124 */
206 void (*tick)(struct dm_cache_policy *p, bool can_block); 125 void (*tick)(struct dm_cache_policy *p, bool can_block);
207 126
@@ -213,6 +132,8 @@ struct dm_cache_policy {
213 int (*set_config_value)(struct dm_cache_policy *p, 132 int (*set_config_value)(struct dm_cache_policy *p,
214 const char *key, const char *value); 133 const char *key, const char *value);
215 134
135 void (*allow_migrations)(struct dm_cache_policy *p, bool allow);
136
216 /* 137 /*
217 * Book keeping ptr for the policy register, not for general use. 138 * Book keeping ptr for the policy register, not for general use.
218 */ 139 */
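
lookup_with_work() is documented above as optional, with policy-internal expected to fall back to plain lookup() when a policy leaves it NULL. That wrapper is not part of this excerpt, so the following is only an assumed sketch of what such a fallback could look like, not the real helper.

/* --- illustrative sketch, not part of the patch --- */
static inline int example_policy_lookup_with_work(struct dm_cache_policy *p,
						  dm_oblock_t oblock,
						  dm_cblock_t *cblock,
						  int data_dir, bool fast_copy,
						  struct policy_work **work)
{
	bool background_queued;

	if (p->lookup_with_work)
		return p->lookup_with_work(p, oblock, cblock, data_dir,
					   fast_copy, work);

	*work = NULL;
	return p->lookup(p, oblock, cblock, data_dir, fast_copy,
			 &background_queued);
}
/* --- end sketch --- */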
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 2eaa414e1509..b7de289a10bb 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -5,7 +5,7 @@
5 */ 5 */
6 6
7#include "dm.h" 7#include "dm.h"
8#include "dm-bio-prison-v1.h" 8#include "dm-bio-prison-v2.h"
9#include "dm-bio-record.h" 9#include "dm-bio-record.h"
10#include "dm-cache-metadata.h" 10#include "dm-cache-metadata.h"
11 11
@@ -15,6 +15,7 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/mempool.h> 16#include <linux/mempool.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rwsem.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
20 21
@@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
25 26
26/*----------------------------------------------------------------*/ 27/*----------------------------------------------------------------*/
27 28
28#define IOT_RESOLUTION 4 29/*
30 * Glossary:
31 *
32 * oblock: index of an origin block
33 * cblock: index of a cache block
34 * promotion: movement of a block from origin to cache
35 * demotion: movement of a block from cache to origin
36 * migration: movement of a block between the origin and cache device,
37 * either direction
38 */
39
40/*----------------------------------------------------------------*/
29 41
30struct io_tracker { 42struct io_tracker {
31 spinlock_t lock; 43 spinlock_t lock;
@@ -99,19 +111,178 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
99/*----------------------------------------------------------------*/ 111/*----------------------------------------------------------------*/
100 112
101/* 113/*
102 * Glossary: 114 * Represents a chunk of future work. 'input' allows continuations to pass
103 * 115 * values between themselves, typically error values.
104 * oblock: index of an origin block
105 * cblock: index of a cache block
106 * promotion: movement of a block from origin to cache
107 * demotion: movement of a block from cache to origin
108 * migration: movement of a block between the origin and cache device,
109 * either direction
110 */ 116 */
117struct continuation {
118 struct work_struct ws;
119 int input;
120};
121
122static inline void init_continuation(struct continuation *k,
123 void (*fn)(struct work_struct *))
124{
125 INIT_WORK(&k->ws, fn);
126 k->input = 0;
127}
128
129static inline void queue_continuation(struct workqueue_struct *wq,
130 struct continuation *k)
131{
132 queue_work(wq, &k->ws);
133}
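
A continuation is nothing more than a work_struct plus an error slot: whoever queues it stores the result of the previous step in k->input, and the work function recovers its enclosing object with container_of(). A hypothetical user of the pattern, in the context of this file (struct example_op and both functions are not from this patch):

/* --- illustrative sketch, not part of the patch --- */
struct example_op {
	struct continuation k;
	struct cache *cache;
};

static void example_op_complete(struct work_struct *ws)
{
	struct example_op *op = container_of(ws, struct example_op, k.ws);

	if (op->k.input)	/* error passed in by the step that queued us */
		DMERR("previous step failed: %d", op->k.input);
	/* ... finish up, release resources ... */
}

static void example_op_start(struct cache *cache, struct example_op *op)
{
	op->cache = cache;
	init_continuation(&op->k, example_op_complete);
	queue_continuation(cache->wq, &op->k);
}
/* --- end sketch --- */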
111 134
112/*----------------------------------------------------------------*/ 135/*----------------------------------------------------------------*/
113 136
114/* 137/*
138 * The batcher collects together pieces of work that need a particular
139 * operation to occur before they can proceed (typically a commit).
140 */
141struct batcher {
142 /*
143 * The operation that everyone is waiting for.
144 */
145 int (*commit_op)(void *context);
146 void *commit_context;
147
148 /*
149 * This is how bios should be issued once the commit op is complete
150 * (accounted_request).
151 */
152 void (*issue_op)(struct bio *bio, void *context);
153 void *issue_context;
154
155 /*
156 * Queued work gets put on here after commit.
157 */
158 struct workqueue_struct *wq;
159
160 spinlock_t lock;
161 struct list_head work_items;
162 struct bio_list bios;
163 struct work_struct commit_work;
164
165 bool commit_scheduled;
166};
167
168static void __commit(struct work_struct *_ws)
169{
170 struct batcher *b = container_of(_ws, struct batcher, commit_work);
171
172 int r;
173 unsigned long flags;
174 struct list_head work_items;
175 struct work_struct *ws, *tmp;
176 struct continuation *k;
177 struct bio *bio;
178 struct bio_list bios;
179
180 INIT_LIST_HEAD(&work_items);
181 bio_list_init(&bios);
182
183 /*
184 * We have to grab these before the commit_op to avoid a race
185 * condition.
186 */
187 spin_lock_irqsave(&b->lock, flags);
188 list_splice_init(&b->work_items, &work_items);
189 bio_list_merge(&bios, &b->bios);
190 bio_list_init(&b->bios);
191 b->commit_scheduled = false;
192 spin_unlock_irqrestore(&b->lock, flags);
193
194 r = b->commit_op(b->commit_context);
195
196 list_for_each_entry_safe(ws, tmp, &work_items, entry) {
197 k = container_of(ws, struct continuation, ws);
198 k->input = r;
199 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
200 queue_work(b->wq, ws);
201 }
202
203 while ((bio = bio_list_pop(&bios))) {
204 if (r) {
205 bio->bi_error = r;
206 bio_endio(bio);
207 } else
208 b->issue_op(bio, b->issue_context);
209 }
210}
211
212static void batcher_init(struct batcher *b,
213 int (*commit_op)(void *),
214 void *commit_context,
215 void (*issue_op)(struct bio *bio, void *),
216 void *issue_context,
217 struct workqueue_struct *wq)
218{
219 b->commit_op = commit_op;
220 b->commit_context = commit_context;
221 b->issue_op = issue_op;
222 b->issue_context = issue_context;
223 b->wq = wq;
224
225 spin_lock_init(&b->lock);
226 INIT_LIST_HEAD(&b->work_items);
227 bio_list_init(&b->bios);
228 INIT_WORK(&b->commit_work, __commit);
229 b->commit_scheduled = false;
230}
231
232static void async_commit(struct batcher *b)
233{
234 queue_work(b->wq, &b->commit_work);
235}
236
237static void continue_after_commit(struct batcher *b, struct continuation *k)
238{
239 unsigned long flags;
240 bool commit_scheduled;
241
242 spin_lock_irqsave(&b->lock, flags);
243 commit_scheduled = b->commit_scheduled;
244 list_add_tail(&k->ws.entry, &b->work_items);
245 spin_unlock_irqrestore(&b->lock, flags);
246
247 if (commit_scheduled)
248 async_commit(b);
249}
250
251/*
252 * Bios are errored if commit failed.
253 */
254static void issue_after_commit(struct batcher *b, struct bio *bio)
255{
256 unsigned long flags;
257 bool commit_scheduled;
258
259 spin_lock_irqsave(&b->lock, flags);
260 commit_scheduled = b->commit_scheduled;
261 bio_list_add(&b->bios, bio);
262 spin_unlock_irqrestore(&b->lock, flags);
263
264 if (commit_scheduled)
265 async_commit(b);
266}
267
268/*
269 * Call this if some urgent work is waiting for the commit to complete.
270 */
271static void schedule_commit(struct batcher *b)
272{
273 bool immediate;
274 unsigned long flags;
275
276 spin_lock_irqsave(&b->lock, flags);
277 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
278 b->commit_scheduled = true;
279 spin_unlock_irqrestore(&b->lock, flags);
280
281 if (immediate)
282 async_commit(b);
283}
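
Putting the batcher together: continuations and bios accumulate under b->lock, schedule_commit() records that a commit is wanted, and __commit() performs one commit on the workqueue and then releases everything that was waiting on it. The wiring below is a hedged sketch: example_commit_op(), example_setup_committer() and example_handle_flush() are hypothetical, while issue_op() and the cache->committer field appear later in this patch.

/* --- illustrative sketch, not part of the patch --- */
static int example_commit_op(void *context)
{
	struct cache *cache = context;

	/* a real commit op would write out the dm-cache metadata transaction */
	return cache ? 0 : -EINVAL;
}

static void example_setup_committer(struct cache *cache)
{
	batcher_init(&cache->committer, example_commit_op, cache,
		     issue_op, cache, cache->wq);
}

/* A flush bio can then be parked until the next commit completes: */
static void example_handle_flush(struct cache *cache, struct bio *bio)
{
	issue_after_commit(&cache->committer, bio);
	schedule_commit(&cache->committer);
}
/* --- end sketch --- */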
284
285/*
115 * There are a couple of places where we let a bio run, but want to do some 286 * There are a couple of places where we let a bio run, but want to do some
116 * work before calling its endio function. We do this by temporarily 287 * work before calling its endio function. We do this by temporarily
117 * changing the endio fn. 288 * changing the endio fn.
@@ -189,31 +360,13 @@ struct cache_stats {
189 atomic_t write_miss; 360 atomic_t write_miss;
190 atomic_t demotion; 361 atomic_t demotion;
191 atomic_t promotion; 362 atomic_t promotion;
363 atomic_t writeback;
192 atomic_t copies_avoided; 364 atomic_t copies_avoided;
193 atomic_t cache_cell_clash; 365 atomic_t cache_cell_clash;
194 atomic_t commit_count; 366 atomic_t commit_count;
195 atomic_t discard_count; 367 atomic_t discard_count;
196}; 368};
197 369
198/*
199 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
200 * the one-past-the-end value.
201 */
202struct cblock_range {
203 dm_cblock_t begin;
204 dm_cblock_t end;
205};
206
207struct invalidation_request {
208 struct list_head list;
209 struct cblock_range *cblocks;
210
211 atomic_t complete;
212 int err;
213
214 wait_queue_head_t result_wait;
215};
216
217struct cache { 370struct cache {
218 struct dm_target *ti; 371 struct dm_target *ti;
219 struct dm_target_callbacks callbacks; 372 struct dm_target_callbacks callbacks;
@@ -255,11 +408,7 @@ struct cache {
255 spinlock_t lock; 408 spinlock_t lock;
256 struct list_head deferred_cells; 409 struct list_head deferred_cells;
257 struct bio_list deferred_bios; 410 struct bio_list deferred_bios;
258 struct bio_list deferred_flush_bios;
259 struct bio_list deferred_writethrough_bios; 411 struct bio_list deferred_writethrough_bios;
260 struct list_head quiesced_migrations;
261 struct list_head completed_migrations;
262 struct list_head need_commit_migrations;
263 sector_t migration_threshold; 412 sector_t migration_threshold;
264 wait_queue_head_t migration_wait; 413 wait_queue_head_t migration_wait;
265 atomic_t nr_allocated_migrations; 414 atomic_t nr_allocated_migrations;
@@ -270,9 +419,7 @@ struct cache {
270 */ 419 */
271 atomic_t nr_io_migrations; 420 atomic_t nr_io_migrations;
272 421
273 wait_queue_head_t quiescing_wait; 422 struct rw_semaphore quiesce_lock;
274 atomic_t quiescing;
275 atomic_t quiescing_ack;
276 423
277 /* 424 /*
278 * cache_size entries, dirty if set 425 * cache_size entries, dirty if set
@@ -296,13 +443,11 @@ struct cache {
296 443
297 struct dm_kcopyd_client *copier; 444 struct dm_kcopyd_client *copier;
298 struct workqueue_struct *wq; 445 struct workqueue_struct *wq;
299 struct work_struct worker; 446 struct work_struct deferred_bio_worker;
300 447 struct work_struct deferred_writethrough_worker;
448 struct work_struct migration_worker;
301 struct delayed_work waker; 449 struct delayed_work waker;
302 unsigned long last_commit_jiffies; 450 struct dm_bio_prison_v2 *prison;
303
304 struct dm_bio_prison *prison;
305 struct dm_deferred_set *all_io_ds;
306 451
307 mempool_t *migration_pool; 452 mempool_t *migration_pool;
308 453
@@ -330,12 +475,17 @@ struct cache {
330 struct list_head invalidation_requests; 475 struct list_head invalidation_requests;
331 476
332 struct io_tracker origin_tracker; 477 struct io_tracker origin_tracker;
478
479 struct work_struct commit_ws;
480 struct batcher committer;
481
482 struct rw_semaphore background_work_lock;
333}; 483};
334 484
335struct per_bio_data { 485struct per_bio_data {
336 bool tick:1; 486 bool tick:1;
337 unsigned req_nr:2; 487 unsigned req_nr:2;
338 struct dm_deferred_entry *all_io_entry; 488 struct dm_bio_prison_cell_v2 *cell;
339 struct dm_hook_info hook_info; 489 struct dm_hook_info hook_info;
340 sector_t len; 490 sector_t len;
341 491
@@ -350,55 +500,64 @@ struct per_bio_data {
350}; 500};
351 501
352struct dm_cache_migration { 502struct dm_cache_migration {
353 struct list_head list; 503 struct continuation k;
354 struct cache *cache; 504 struct cache *cache;
355 505
356 unsigned long start_jiffies; 506 struct policy_work *op;
357 dm_oblock_t old_oblock; 507 struct bio *overwrite_bio;
358 dm_oblock_t new_oblock; 508 struct dm_bio_prison_cell_v2 *cell;
359 dm_cblock_t cblock;
360
361 bool err:1;
362 bool discard:1;
363 bool writeback:1;
364 bool demote:1;
365 bool promote:1;
366 bool requeue_holder:1;
367 bool invalidate:1;
368 509
369 struct dm_bio_prison_cell *old_ocell; 510 dm_cblock_t invalidate_cblock;
370 struct dm_bio_prison_cell *new_ocell; 511 dm_oblock_t invalidate_oblock;
371}; 512};
372 513
373/* 514/*----------------------------------------------------------------*/
374 * Processing a bio in the worker thread may require these memory 515
375 * allocations. We prealloc to avoid deadlocks (the same worker thread 516static bool writethrough_mode(struct cache_features *f)
376 * frees them back to the mempool). 517{
377 */ 518 return f->io_mode == CM_IO_WRITETHROUGH;
378struct prealloc { 519}
379 struct dm_cache_migration *mg;
380 struct dm_bio_prison_cell *cell1;
381 struct dm_bio_prison_cell *cell2;
382};
383 520
384static enum cache_metadata_mode get_cache_mode(struct cache *cache); 521static bool writeback_mode(struct cache_features *f)
522{
523 return f->io_mode == CM_IO_WRITEBACK;
524}
385 525
386static void wake_worker(struct cache *cache) 526static inline bool passthrough_mode(struct cache_features *f)
387{ 527{
388 queue_work(cache->wq, &cache->worker); 528 return unlikely(f->io_mode == CM_IO_PASSTHROUGH);
389} 529}
390 530
391/*----------------------------------------------------------------*/ 531/*----------------------------------------------------------------*/
392 532
393static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 533static void wake_deferred_bio_worker(struct cache *cache)
394{ 534{
395 /* FIXME: change to use a local slab. */ 535 queue_work(cache->wq, &cache->deferred_bio_worker);
396 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
397} 536}
398 537
399static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 538static void wake_deferred_writethrough_worker(struct cache *cache)
400{ 539{
401 dm_bio_prison_free_cell(cache->prison, cell); 540 queue_work(cache->wq, &cache->deferred_writethrough_worker);
541}
542
543static void wake_migration_worker(struct cache *cache)
544{
545 if (passthrough_mode(&cache->features))
546 return;
547
548 queue_work(cache->wq, &cache->migration_worker);
549}
550
551/*----------------------------------------------------------------*/
552
553static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
554{
555 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
556}
557
558static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
559{
560 dm_bio_prison_free_cell_v2(cache->prison, cell);
402} 561}
403 562
404static struct dm_cache_migration *alloc_migration(struct cache *cache) 563static struct dm_cache_migration *alloc_migration(struct cache *cache)
@@ -424,146 +583,127 @@ static void free_migration(struct dm_cache_migration *mg)
424 mempool_free(mg, cache->migration_pool); 583 mempool_free(mg, cache->migration_pool);
425} 584}
426 585
427static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 586/*----------------------------------------------------------------*/
428{
429 if (!p->mg) {
430 p->mg = alloc_migration(cache);
431 if (!p->mg)
432 return -ENOMEM;
433 }
434
435 if (!p->cell1) {
436 p->cell1 = alloc_prison_cell(cache);
437 if (!p->cell1)
438 return -ENOMEM;
439 }
440
441 if (!p->cell2) {
442 p->cell2 = alloc_prison_cell(cache);
443 if (!p->cell2)
444 return -ENOMEM;
445 }
446
447 return 0;
448}
449 587
450static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 588static inline dm_oblock_t oblock_succ(dm_oblock_t b)
451{ 589{
452 if (p->cell2) 590 return to_oblock(from_oblock(b) + 1ull);
453 free_prison_cell(cache, p->cell2);
454
455 if (p->cell1)
456 free_prison_cell(cache, p->cell1);
457
458 if (p->mg)
459 free_migration(p->mg);
460} 591}
461 592
462static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 593static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
463{ 594{
464 struct dm_cache_migration *mg = p->mg; 595 key->virtual = 0;
465 596 key->dev = 0;
466 BUG_ON(!mg); 597 key->block_begin = from_oblock(begin);
467 p->mg = NULL; 598 key->block_end = from_oblock(end);
468
469 return mg;
470} 599}
471 600
472/* 601/*
473 * You must have a cell within the prealloc struct to return. If not this 602 * We have two lock levels. Level 0, which is used to prevent WRITEs, and
474 * function will BUG() rather than returning NULL. 603 * level 1 which prevents *both* READs and WRITEs.
475 */ 604 */
476static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 605#define WRITE_LOCK_LEVEL 0
606#define READ_WRITE_LOCK_LEVEL 1
607
608static unsigned lock_level(struct bio *bio)
477{ 609{
478 struct dm_bio_prison_cell *r = NULL; 610 return bio_data_dir(bio) == WRITE ?
611 WRITE_LOCK_LEVEL :
612 READ_WRITE_LOCK_LEVEL;
613}
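
Under bio-prison v2 a cell is held shared at a level rather than simply detained per bio: WRITE bios take level 0 and READ bios level 1, so an exclusive lock at level 0 stalls writers while still letting readers through, and a level 1 lock fences both. A minimal sketch of the shared side, mirroring bio_detain_shared() further down (example_get_shared() itself is hypothetical):

/* --- illustrative sketch, not part of the patch --- */
static bool example_get_shared(struct cache *cache, dm_oblock_t oblock,
			       struct bio *bio,
			       struct dm_bio_prison_cell_v2 *prealloc,
			       struct dm_bio_prison_cell_v2 **cell)
{
	struct dm_cell_key_v2 key;

	build_key(oblock, oblock_succ(oblock), &key);

	/*
	 * Returns false when the bio cannot be admitted because the cell is
	 * exclusively locked against it; the caller must then park the bio.
	 */
	return dm_cell_get_v2(cache->prison, &key, lock_level(bio),
			      bio, prealloc, cell);
}
/* --- end sketch --- */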
479 614
480 if (p->cell1) { 615/*----------------------------------------------------------------
481 r = p->cell1; 616 * Per bio data
482 p->cell1 = NULL; 617 *--------------------------------------------------------------*/
483 618
484 } else if (p->cell2) { 619/*
485 r = p->cell2; 620 * If using writeback, leave out struct per_bio_data's writethrough fields.
486 p->cell2 = NULL; 621 */
487 } else 622#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
488 BUG(); 623#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
489 624
490 return r; 625static size_t get_per_bio_data_size(struct cache *cache)
626{
627 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
491} 628}
492 629
493/* 630static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
494 * You can't have more than two cells in a prealloc struct. BUG() will be
495 * called if you try and overfill.
496 */
497static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
498{ 631{
499 if (!p->cell2) 632 struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
500 p->cell2 = cell; 633 BUG_ON(!pb);
634 return pb;
635}
501 636
502 else if (!p->cell1) 637static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
503 p->cell1 = cell; 638{
639 struct per_bio_data *pb = get_per_bio_data(bio, data_size);
504 640
505 else 641 pb->tick = false;
506 BUG(); 642 pb->req_nr = dm_bio_get_target_bio_nr(bio);
643 pb->cell = NULL;
644 pb->len = 0;
645
646 return pb;
507} 647}
508 648
509/*----------------------------------------------------------------*/ 649/*----------------------------------------------------------------*/
510 650
511static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) 651static void defer_bio(struct cache *cache, struct bio *bio)
512{ 652{
513 key->virtual = 0; 653 unsigned long flags;
514 key->dev = 0;
515 key->block_begin = from_oblock(begin);
516 key->block_end = from_oblock(end);
517}
518 654
519/* 655 spin_lock_irqsave(&cache->lock, flags);
520 * The caller hands in a preallocated cell, and a free function for it. 656 bio_list_add(&cache->deferred_bios, bio);
521 * The cell will be freed if there's an error, or if it wasn't used because 657 spin_unlock_irqrestore(&cache->lock, flags);
522 * a cell with that key already exists.
523 */
524typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
525 658
526static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, 659 wake_deferred_bio_worker(cache);
527 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 660}
528 cell_free_fn free_fn, void *free_context, 661
529 struct dm_bio_prison_cell **cell_result) 662static void defer_bios(struct cache *cache, struct bio_list *bios)
530{ 663{
531 int r; 664 unsigned long flags;
532 struct dm_cell_key key;
533 665
534 build_key(oblock_begin, oblock_end, &key); 666 spin_lock_irqsave(&cache->lock, flags);
535 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 667 bio_list_merge(&cache->deferred_bios, bios);
536 if (r) 668 bio_list_init(bios);
537 free_fn(free_context, cell_prealloc); 669 spin_unlock_irqrestore(&cache->lock, flags);
538 670
539 return r; 671 wake_deferred_bio_worker(cache);
540} 672}
541 673
542static int bio_detain(struct cache *cache, dm_oblock_t oblock, 674/*----------------------------------------------------------------*/
543 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 675
544 cell_free_fn free_fn, void *free_context, 676static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
545 struct dm_bio_prison_cell **cell_result)
546{ 677{
678 bool r;
679 size_t pb_size;
680 struct per_bio_data *pb;
681 struct dm_cell_key_v2 key;
547 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 682 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
548 return bio_detain_range(cache, oblock, end, bio, 683 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
549 cell_prealloc, free_fn, free_context, cell_result);
550}
551 684
552static int get_cell(struct cache *cache, 685 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
553 dm_oblock_t oblock, 686 if (!cell_prealloc) {
554 struct prealloc *structs, 687 defer_bio(cache, bio);
555 struct dm_bio_prison_cell **cell_result) 688 return false;
556{ 689 }
557 int r;
558 struct dm_cell_key key;
559 struct dm_bio_prison_cell *cell_prealloc;
560 690
561 cell_prealloc = prealloc_get_cell(structs); 691 build_key(oblock, end, &key);
692 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
693 if (!r) {
694 /*
695 * Failed to get the lock.
696 */
697 free_prison_cell(cache, cell_prealloc);
698 return r;
699 }
562 700
563 build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); 701 if (cell != cell_prealloc)
564 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 702 free_prison_cell(cache, cell_prealloc);
565 if (r) 703
566 prealloc_put_cell(structs, cell_prealloc); 704 pb_size = get_per_bio_data_size(cache);
705 pb = get_per_bio_data(bio, pb_size);
706 pb->cell = cell;
567 707
568 return r; 708 return r;
569} 709}
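
The shared reference stored in pb->cell has to be dropped once the bio completes; the endio side is outside this excerpt, so the following is only an assumed sketch. dm_cell_put_v2() belongs to the bio-prison v2 API but does not appear in this hunk, its "returns true on the last put" behaviour is an assumption here, and the real code also has to release any bios that were waiting on the cell.

/* --- illustrative sketch, not part of the patch --- */
static void example_release_cell(struct cache *cache, struct bio *bio)
{
	size_t pb_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_size);

	if (pb->cell) {
		if (dm_cell_put_v2(cache->prison, pb->cell))
			free_prison_cell(cache, pb->cell);	/* last reference */
		pb->cell = NULL;
	}
}
/* --- end sketch --- */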
@@ -575,21 +715,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b)
575 return test_bit(from_cblock(b), cache->dirty_bitset); 715 return test_bit(from_cblock(b), cache->dirty_bitset);
576} 716}
577 717
578static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 718static void set_dirty(struct cache *cache, dm_cblock_t cblock)
579{ 719{
580 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 720 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
581 atomic_inc(&cache->nr_dirty); 721 atomic_inc(&cache->nr_dirty);
582 policy_set_dirty(cache->policy, oblock); 722 policy_set_dirty(cache->policy, cblock);
583 } 723 }
584} 724}
585 725
586static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 726/*
727 * These two are called when setting after migrations to force the policy
728 * and dirty bitset to be in sync.
729 */
730static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
731{
732 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
733 atomic_inc(&cache->nr_dirty);
734 policy_set_dirty(cache->policy, cblock);
735}
736
737static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
587{ 738{
588 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 739 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
589 policy_clear_dirty(cache->policy, oblock);
590 if (atomic_dec_return(&cache->nr_dirty) == 0) 740 if (atomic_dec_return(&cache->nr_dirty) == 0)
591 dm_table_event(cache->ti->table); 741 dm_table_event(cache->ti->table);
592 } 742 }
743
744 policy_clear_dirty(cache->policy, cblock);
593} 745}
594 746
595/*----------------------------------------------------------------*/ 747/*----------------------------------------------------------------*/
@@ -628,11 +780,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
628 oblocks_per_dblock(cache))); 780 oblocks_per_dblock(cache)));
629} 781}
630 782
631static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
632{
633 return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
634}
635
636static void set_discard(struct cache *cache, dm_dblock_t b) 783static void set_discard(struct cache *cache, dm_dblock_t b)
637{ 784{
638 unsigned long flags; 785 unsigned long flags;
@@ -679,83 +826,6 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
679 return r; 826 return r;
680} 827}
681 828
682/*----------------------------------------------------------------*/
683
684static void load_stats(struct cache *cache)
685{
686 struct dm_cache_statistics stats;
687
688 dm_cache_metadata_get_stats(cache->cmd, &stats);
689 atomic_set(&cache->stats.read_hit, stats.read_hits);
690 atomic_set(&cache->stats.read_miss, stats.read_misses);
691 atomic_set(&cache->stats.write_hit, stats.write_hits);
692 atomic_set(&cache->stats.write_miss, stats.write_misses);
693}
694
695static void save_stats(struct cache *cache)
696{
697 struct dm_cache_statistics stats;
698
699 if (get_cache_mode(cache) >= CM_READ_ONLY)
700 return;
701
702 stats.read_hits = atomic_read(&cache->stats.read_hit);
703 stats.read_misses = atomic_read(&cache->stats.read_miss);
704 stats.write_hits = atomic_read(&cache->stats.write_hit);
705 stats.write_misses = atomic_read(&cache->stats.write_miss);
706
707 dm_cache_metadata_set_stats(cache->cmd, &stats);
708}
709
710/*----------------------------------------------------------------
711 * Per bio data
712 *--------------------------------------------------------------*/
713
714/*
715 * If using writeback, leave out struct per_bio_data's writethrough fields.
716 */
717#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
718#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
719
720static bool writethrough_mode(struct cache_features *f)
721{
722 return f->io_mode == CM_IO_WRITETHROUGH;
723}
724
725static bool writeback_mode(struct cache_features *f)
726{
727 return f->io_mode == CM_IO_WRITEBACK;
728}
729
730static bool passthrough_mode(struct cache_features *f)
731{
732 return f->io_mode == CM_IO_PASSTHROUGH;
733}
734
735static size_t get_per_bio_data_size(struct cache *cache)
736{
737 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
738}
739
740static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
741{
742 struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
743 BUG_ON(!pb);
744 return pb;
745}
746
747static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
748{
749 struct per_bio_data *pb = get_per_bio_data(bio, data_size);
750
751 pb->tick = false;
752 pb->req_nr = dm_bio_get_target_bio_nr(bio);
753 pb->all_io_entry = NULL;
754 pb->len = 0;
755
756 return pb;
757}
758
759/*---------------------------------------------------------------- 829/*----------------------------------------------------------------
760 * Remapping 830 * Remapping
761 *--------------------------------------------------------------*/ 831 *--------------------------------------------------------------*/
@@ -797,8 +867,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
797} 867}
798 868
799static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 869static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
800 dm_oblock_t oblock) 870 dm_oblock_t oblock)
801{ 871{
872 // FIXME: this is called way too much.
802 check_if_tick_bio_needed(cache, bio); 873 check_if_tick_bio_needed(cache, bio);
803 remap_to_origin(cache, bio); 874 remap_to_origin(cache, bio);
804 if (bio_data_dir(bio) == WRITE) 875 if (bio_data_dir(bio) == WRITE)
@@ -811,7 +882,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
811 check_if_tick_bio_needed(cache, bio); 882 check_if_tick_bio_needed(cache, bio);
812 remap_to_cache(cache, bio, cblock); 883 remap_to_cache(cache, bio, cblock);
813 if (bio_data_dir(bio) == WRITE) { 884 if (bio_data_dir(bio) == WRITE) {
814 set_dirty(cache, oblock, cblock); 885 set_dirty(cache, cblock);
815 clear_discard(cache, oblock_to_dblock(cache, oblock)); 886 clear_discard(cache, oblock_to_dblock(cache, oblock));
816 } 887 }
817} 888}
@@ -828,22 +899,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
828 return to_oblock(block_nr); 899 return to_oblock(block_nr);
829} 900}
830 901
831/*
832 * You must increment the deferred set whilst the prison cell is held. To
833 * encourage this, we ask for 'cell' to be passed in.
834 */
835static void inc_ds(struct cache *cache, struct bio *bio,
836 struct dm_bio_prison_cell *cell)
837{
838 size_t pb_data_size = get_per_bio_data_size(cache);
839 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
840
841 BUG_ON(!cell);
842 BUG_ON(pb->all_io_entry);
843
844 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
845}
846
847static bool accountable_bio(struct cache *cache, struct bio *bio) 902static bool accountable_bio(struct cache *cache, struct bio *bio)
848{ 903{
849 return ((bio->bi_bdev == cache->origin_dev->bdev) && 904 return ((bio->bi_bdev == cache->origin_dev->bdev) &&
@@ -875,29 +930,10 @@ static void accounted_request(struct cache *cache, struct bio *bio)
875 generic_make_request(bio); 930 generic_make_request(bio);
876} 931}
877 932
878static void issue(struct cache *cache, struct bio *bio) 933static void issue_op(struct bio *bio, void *context)
879{ 934{
880 unsigned long flags; 935 struct cache *cache = context;
881 936 accounted_request(cache, bio);
882 if (!op_is_flush(bio->bi_opf)) {
883 accounted_request(cache, bio);
884 return;
885 }
886
887 /*
888 * Batch together any bios that trigger commits and then issue a
889 * single commit for them in do_worker().
890 */
891 spin_lock_irqsave(&cache->lock, flags);
892 cache->commit_requested = true;
893 bio_list_add(&cache->deferred_flush_bios, bio);
894 spin_unlock_irqrestore(&cache->lock, flags);
895}
896
897static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
898{
899 inc_ds(cache, bio, cell);
900 issue(cache, bio);
901} 937}
902 938
903static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 939static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
@@ -908,7 +944,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
908 bio_list_add(&cache->deferred_writethrough_bios, bio); 944 bio_list_add(&cache->deferred_writethrough_bios, bio);
909 spin_unlock_irqrestore(&cache->lock, flags); 945 spin_unlock_irqrestore(&cache->lock, flags);
910 946
911 wake_worker(cache); 947 wake_deferred_writethrough_worker(cache);
912} 948}
913 949
914static void writethrough_endio(struct bio *bio) 950static void writethrough_endio(struct bio *bio)
@@ -934,6 +970,7 @@ static void writethrough_endio(struct bio *bio)
934} 970}
935 971
936/* 972/*
973 * FIXME: send in parallel, huge latency as is.
937 * When running in writethrough mode we need to send writes to clean blocks 974 * When running in writethrough mode we need to send writes to clean blocks
938 * to both the cache and origin devices. In future we'd like to clone the 975 * to both the cache and origin devices. In future we'd like to clone the
939 * bio and send them in parallel, but for now we're doing them in 976 * bio and send them in parallel, but for now we're doing them in
@@ -1046,12 +1083,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r
1046 set_cache_mode(cache, CM_READ_ONLY); 1083 set_cache_mode(cache, CM_READ_ONLY);
1047} 1084}
1048 1085
1086/*----------------------------------------------------------------*/
1087
1088static void load_stats(struct cache *cache)
1089{
1090 struct dm_cache_statistics stats;
1091
1092 dm_cache_metadata_get_stats(cache->cmd, &stats);
1093 atomic_set(&cache->stats.read_hit, stats.read_hits);
1094 atomic_set(&cache->stats.read_miss, stats.read_misses);
1095 atomic_set(&cache->stats.write_hit, stats.write_hits);
1096 atomic_set(&cache->stats.write_miss, stats.write_misses);
1097}
1098
1099static void save_stats(struct cache *cache)
1100{
1101 struct dm_cache_statistics stats;
1102
1103 if (get_cache_mode(cache) >= CM_READ_ONLY)
1104 return;
1105
1106 stats.read_hits = atomic_read(&cache->stats.read_hit);
1107 stats.read_misses = atomic_read(&cache->stats.read_miss);
1108 stats.write_hits = atomic_read(&cache->stats.write_hit);
1109 stats.write_misses = atomic_read(&cache->stats.write_miss);
1110
1111 dm_cache_metadata_set_stats(cache->cmd, &stats);
1112}
1113
1114static void update_stats(struct cache_stats *stats, enum policy_operation op)
1115{
1116 switch (op) {
1117 case POLICY_PROMOTE:
1118 atomic_inc(&stats->promotion);
1119 break;
1120
1121 case POLICY_DEMOTE:
1122 atomic_inc(&stats->demotion);
1123 break;
1124
1125 case POLICY_WRITEBACK:
1126 atomic_inc(&stats->writeback);
1127 break;
1128 }
1129}
1130
1049/*---------------------------------------------------------------- 1131/*----------------------------------------------------------------
1050 * Migration processing 1132 * Migration processing
1051 * 1133 *
1052 * Migration covers moving data from the origin device to the cache, or 1134 * Migration covers moving data from the origin device to the cache, or
1053 * vice versa. 1135 * vice versa.
1054 *--------------------------------------------------------------*/ 1136 *--------------------------------------------------------------*/
1137
1055static void inc_io_migrations(struct cache *cache) 1138static void inc_io_migrations(struct cache *cache)
1056{ 1139{
1057 atomic_inc(&cache->nr_io_migrations); 1140 atomic_inc(&cache->nr_io_migrations);
@@ -1067,213 +1150,109 @@ static bool discard_or_flush(struct bio *bio)
1067 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1150 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1068} 1151}
1069 1152
1070static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) 1153static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1071{ 1154 dm_dblock_t *b, dm_dblock_t *e)
1072 if (discard_or_flush(cell->holder)) {
1073 /*
1074 * We have to handle these bios individually.
1075 */
1076 dm_cell_release(cache->prison, cell, &cache->deferred_bios);
1077 free_prison_cell(cache, cell);
1078 } else
1079 list_add_tail(&cell->user_list, &cache->deferred_cells);
1080}
1081
1082static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
1083{ 1155{
1084 unsigned long flags; 1156 sector_t sb = bio->bi_iter.bi_sector;
1085 1157 sector_t se = bio_end_sector(bio);
1086 if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
1087 /*
1088 * There was no prisoner to promote to holder, the
1089 * cell has been released.
1090 */
1091 free_prison_cell(cache, cell);
1092 return;
1093 }
1094 1158
1095 spin_lock_irqsave(&cache->lock, flags); 1159 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1096 __cell_defer(cache, cell);
1097 spin_unlock_irqrestore(&cache->lock, flags);
1098 1160
1099 wake_worker(cache); 1161 if (se - sb < cache->discard_block_size)
1162 *e = *b;
1163 else
1164 *e = to_dblock(block_div(se, cache->discard_block_size));
1100} 1165}
1101 1166
1102static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) 1167/*----------------------------------------------------------------*/
1103{
1104 dm_cell_error(cache->prison, cell, err);
1105 free_prison_cell(cache, cell);
1106}
1107 1168
1108static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) 1169static void prevent_background_work(struct cache *cache)
1109{ 1170{
1110 cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); 1171 lockdep_off();
1172 down_write(&cache->background_work_lock);
1173 lockdep_on();
1111} 1174}
1112 1175
1113static void free_io_migration(struct dm_cache_migration *mg) 1176static void allow_background_work(struct cache *cache)
1114{ 1177{
1115 struct cache *cache = mg->cache; 1178 lockdep_off();
1116 1179 up_write(&cache->background_work_lock);
1117 dec_io_migrations(cache); 1180 lockdep_on();
1118 free_migration(mg);
1119 wake_worker(cache);
1120} 1181}
1121 1182
1122static void migration_failure(struct dm_cache_migration *mg) 1183static bool background_work_begin(struct cache *cache)
1123{ 1184{
1124 struct cache *cache = mg->cache; 1185 bool r;
1125 const char *dev_name = cache_device_name(cache);
1126
1127 if (mg->writeback) {
1128 DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
1129 set_dirty(cache, mg->old_oblock, mg->cblock);
1130 cell_defer(cache, mg->old_ocell, false);
1131
1132 } else if (mg->demote) {
1133 DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
1134 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
1135 1186
1136 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1187 lockdep_off();
1137 if (mg->promote) 1188 r = down_read_trylock(&cache->background_work_lock);
1138 cell_defer(cache, mg->new_ocell, true); 1189 lockdep_on();
1139 } else {
1140 DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
1141 policy_remove_mapping(cache->policy, mg->new_oblock);
1142 cell_defer(cache, mg->new_ocell, true);
1143 }
1144 1190
1145 free_io_migration(mg); 1191 return r;
1146} 1192}
1147 1193
1148static void migration_success_pre_commit(struct dm_cache_migration *mg) 1194static void background_work_end(struct cache *cache)
1149{ 1195{
1150 int r; 1196 lockdep_off();
1151 unsigned long flags; 1197 up_read(&cache->background_work_lock);
1152 struct cache *cache = mg->cache; 1198 lockdep_on();
1153 1199}
1154 if (mg->writeback) {
1155 clear_dirty(cache, mg->old_oblock, mg->cblock);
1156 cell_defer(cache, mg->old_ocell, false);
1157 free_io_migration(mg);
1158 return;
1159 1200
1160 } else if (mg->demote) { 1201/*----------------------------------------------------------------*/
1161 r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
1162 if (r) {
1163 DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
1164 cache_device_name(cache));
1165 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1166 policy_force_mapping(cache->policy, mg->new_oblock,
1167 mg->old_oblock);
1168 if (mg->promote)
1169 cell_defer(cache, mg->new_ocell, true);
1170 free_io_migration(mg);
1171 return;
1172 }
1173 } else {
1174 r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
1175 if (r) {
1176 DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
1177 cache_device_name(cache));
1178 metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1179 policy_remove_mapping(cache->policy, mg->new_oblock);
1180 free_io_migration(mg);
1181 return;
1182 }
1183 }
1184 1202
1185 spin_lock_irqsave(&cache->lock, flags); 1203static void quiesce(struct dm_cache_migration *mg,
1186 list_add_tail(&mg->list, &cache->need_commit_migrations); 1204 void (*continuation)(struct work_struct *))
1187 cache->commit_requested = true; 1205{
1188 spin_unlock_irqrestore(&cache->lock, flags); 1206 init_continuation(&mg->k, continuation);
1207 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1189} 1208}
1190 1209
1191static void migration_success_post_commit(struct dm_cache_migration *mg) 1210static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1192{ 1211{
1193 unsigned long flags; 1212 struct continuation *k = container_of(ws, struct continuation, ws);
1194 struct cache *cache = mg->cache; 1213 return container_of(k, struct dm_cache_migration, k);
1195
1196 if (mg->writeback) {
1197 DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
1198 cache_device_name(cache));
1199 return;
1200
1201 } else if (mg->demote) {
1202 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
1203
1204 if (mg->promote) {
1205 mg->demote = false;
1206
1207 spin_lock_irqsave(&cache->lock, flags);
1208 list_add_tail(&mg->list, &cache->quiesced_migrations);
1209 spin_unlock_irqrestore(&cache->lock, flags);
1210
1211 } else {
1212 if (mg->invalidate)
1213 policy_remove_mapping(cache->policy, mg->old_oblock);
1214 free_io_migration(mg);
1215 }
1216
1217 } else {
1218 if (mg->requeue_holder) {
1219 clear_dirty(cache, mg->new_oblock, mg->cblock);
1220 cell_defer(cache, mg->new_ocell, true);
1221 } else {
1222 /*
1223 * The block was promoted via an overwrite, so it's dirty.
1224 */
1225 set_dirty(cache, mg->new_oblock, mg->cblock);
1226 bio_endio(mg->new_ocell->holder);
1227 cell_defer(cache, mg->new_ocell, false);
1228 }
1229 free_io_migration(mg);
1230 }
1231} 1214}
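ws_to_mg() above recovers the enclosing migration from a queued work item with two container_of() steps (work_struct -> continuation -> migration). A minimal, self-contained userspace sketch of that recovery pattern, using hypothetical struct names rather than the kernel types:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work { int pending; };				/* stand-in for work_struct */
struct continuation { struct work ws; int input; };
struct migration { int id; struct continuation k; };

static struct migration *ws_to_migration(struct work *ws)
{
	/* same two-step recovery as ws_to_mg() in the patch */
	struct continuation *k = container_of(ws, struct continuation, ws);
	return container_of(k, struct migration, k);
}

int main(void)
{
	struct migration mg = { .id = 7 };
	printf("%d\n", ws_to_migration(&mg.k.ws)->id);	/* prints 7 */
	return 0;
}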
1232 1215
1233static void copy_complete(int read_err, unsigned long write_err, void *context) 1216static void copy_complete(int read_err, unsigned long write_err, void *context)
1234{ 1217{
1235 unsigned long flags; 1218 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1236 struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
1237 struct cache *cache = mg->cache;
1238 1219
1239 if (read_err || write_err) 1220 if (read_err || write_err)
1240 mg->err = true; 1221 mg->k.input = -EIO;
1241
1242 spin_lock_irqsave(&cache->lock, flags);
1243 list_add_tail(&mg->list, &cache->completed_migrations);
1244 spin_unlock_irqrestore(&cache->lock, flags);
1245 1222
1246 wake_worker(cache); 1223 queue_continuation(mg->cache->wq, &mg->k);
1247} 1224}
1248 1225
1249static void issue_copy(struct dm_cache_migration *mg) 1226static int copy(struct dm_cache_migration *mg, bool promote)
1250{ 1227{
1251 int r; 1228 int r;
1252 struct dm_io_region o_region, c_region; 1229 struct dm_io_region o_region, c_region;
1253 struct cache *cache = mg->cache; 1230 struct cache *cache = mg->cache;
1254 sector_t cblock = from_cblock(mg->cblock);
1255 1231
1256 o_region.bdev = cache->origin_dev->bdev; 1232 o_region.bdev = cache->origin_dev->bdev;
1233 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1257 o_region.count = cache->sectors_per_block; 1234 o_region.count = cache->sectors_per_block;
1258 1235
1259 c_region.bdev = cache->cache_dev->bdev; 1236 c_region.bdev = cache->cache_dev->bdev;
1260 c_region.sector = cblock * cache->sectors_per_block; 1237 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1261 c_region.count = cache->sectors_per_block; 1238 c_region.count = cache->sectors_per_block;
1262 1239
1263 if (mg->writeback || mg->demote) { 1240 if (promote)
1264 /* demote */ 1241 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1265 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 1242 else
1266 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 1243 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1267 } else {
1268 /* promote */
1269 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
1270 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
1271 }
1272 1244
1273 if (r < 0) { 1245 return r;
1274 DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); 1246}
1275 migration_failure(mg); 1247
1276 } 1248static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1249{
1250 size_t pb_data_size = get_per_bio_data_size(cache);
1251 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1252
1253 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1254 free_prison_cell(cache, pb->cell);
1255 pb->cell = NULL;
1277} 1256}
1278 1257
1279static void overwrite_endio(struct bio *bio) 1258static void overwrite_endio(struct bio *bio)
@@ -1282,368 +1261,475 @@ static void overwrite_endio(struct bio *bio)
1282 struct cache *cache = mg->cache; 1261 struct cache *cache = mg->cache;
1283 size_t pb_data_size = get_per_bio_data_size(cache); 1262 size_t pb_data_size = get_per_bio_data_size(cache);
1284 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1263 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1285 unsigned long flags;
1286 1264
1287 dm_unhook_bio(&pb->hook_info, bio); 1265 dm_unhook_bio(&pb->hook_info, bio);
1288 1266
1289 if (bio->bi_error) 1267 if (bio->bi_error)
1290 mg->err = true; 1268 mg->k.input = bio->bi_error;
1291 1269
1292 mg->requeue_holder = false; 1270 queue_continuation(mg->cache->wq, &mg->k);
1293
1294 spin_lock_irqsave(&cache->lock, flags);
1295 list_add_tail(&mg->list, &cache->completed_migrations);
1296 spin_unlock_irqrestore(&cache->lock, flags);
1297
1298 wake_worker(cache);
1299} 1271}
1300 1272
1301static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) 1273static void overwrite(struct dm_cache_migration *mg,
1274 void (*continuation)(struct work_struct *))
1302{ 1275{
1276 struct bio *bio = mg->overwrite_bio;
1303 size_t pb_data_size = get_per_bio_data_size(mg->cache); 1277 size_t pb_data_size = get_per_bio_data_size(mg->cache);
1304 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1278 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1305 1279
1306 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1280 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1307 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1308 1281
1309 /* 1282 /*
1310 * No need to inc_ds() here, since the cell will be held for the 1283 * The overwrite bio is part of the copy operation, as such it does
1311 * duration of the io. 1284 * not set/clear discard or dirty flags.
1312 */ 1285 */
1286 if (mg->op->op == POLICY_PROMOTE)
1287 remap_to_cache(mg->cache, bio, mg->op->cblock);
1288 else
1289 remap_to_origin(mg->cache, bio);
1290
1291 init_continuation(&mg->k, continuation);
1313 accounted_request(mg->cache, bio); 1292 accounted_request(mg->cache, bio);
1314} 1293}
1315 1294
1316static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1295/*
1296 * Migration steps:
1297 *
1298 * 1) exclusive lock preventing WRITEs
1299 * 2) quiesce
1300 * 3) copy or issue overwrite bio
1301 * 4) upgrade to exclusive lock preventing READs and WRITEs
1302 * 5) quiesce
1303 * 6) update metadata and commit
1304 * 7) unlock
1305 */
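/*
 * As implemented below, these steps roughly map onto the continuation
 * chain: mg_start() -> mg_lock_writes() (step 1; steps 1+4 at once when
 * an overwrite bio is supplied) -> quiesce() -> mg_copy(), which issues
 * either copy() or overwrite() (steps 2-3) -> mg_upgrade_lock() ->
 * quiesce() (steps 4-5) -> mg_update_metadata() (step 6; demotions also
 * force a commit via continue_after_commit()/schedule_commit()) ->
 * mg_success()/mg_complete(), which unlocks the cell and re-issues any
 * deferred bios (step 7).
 */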
1306static void mg_complete(struct dm_cache_migration *mg, bool success)
1317{ 1307{
1318 return (bio_data_dir(bio) == WRITE) && 1308 struct bio_list bios;
1319 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1309 struct cache *cache = mg->cache;
1320} 1310 struct policy_work *op = mg->op;
1311 dm_cblock_t cblock = op->cblock;
1312
1313 if (success)
1314 update_stats(&cache->stats, op->op);
1315
1316 switch (op->op) {
1317 case POLICY_PROMOTE:
1318 clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1319 policy_complete_background_work(cache->policy, op, success);
1320
1321 if (mg->overwrite_bio) {
1322 if (success)
1323 force_set_dirty(cache, cblock);
1324 else
1325 mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO);
1326 bio_endio(mg->overwrite_bio);
1327 } else {
1328 if (success)
1329 force_clear_dirty(cache, cblock);
1330 dec_io_migrations(cache);
1331 }
1332 break;
1321 1333
1322static void avoid_copy(struct dm_cache_migration *mg) 1334 case POLICY_DEMOTE:
1323{ 1335 /*
1324 atomic_inc(&mg->cache->stats.copies_avoided); 1336 * We clear dirty here to update the nr_dirty counter.
1325 migration_success_pre_commit(mg); 1337 */
1326} 1338 if (success)
1339 force_clear_dirty(cache, cblock);
1340 policy_complete_background_work(cache->policy, op, success);
1341 dec_io_migrations(cache);
1342 break;
1327 1343
1328static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1344 case POLICY_WRITEBACK:
1329 dm_dblock_t *b, dm_dblock_t *e) 1345 if (success)
1330{ 1346 force_clear_dirty(cache, cblock);
1331 sector_t sb = bio->bi_iter.bi_sector; 1347 policy_complete_background_work(cache->policy, op, success);
1332 sector_t se = bio_end_sector(bio); 1348 dec_io_migrations(cache);
1349 break;
1350 }
1333 1351
1334 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1352 bio_list_init(&bios);
1353 if (mg->cell) {
1354 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1355 free_prison_cell(cache, mg->cell);
1356 }
1335 1357
1336 if (se - sb < cache->discard_block_size) 1358 free_migration(mg);
1337 *e = *b; 1359 defer_bios(cache, &bios);
1338 else 1360 wake_migration_worker(cache);
1339 *e = to_dblock(block_div(se, cache->discard_block_size)); 1361
1362 background_work_end(cache);
1340} 1363}
1341 1364
1342static void issue_discard(struct dm_cache_migration *mg) 1365static void mg_success(struct work_struct *ws)
1343{ 1366{
1344 dm_dblock_t b, e; 1367 struct dm_cache_migration *mg = ws_to_mg(ws);
1345 struct bio *bio = mg->new_ocell->holder; 1368 mg_complete(mg, mg->k.input == 0);
1346 struct cache *cache = mg->cache;
1347
1348 calc_discard_block_range(cache, bio, &b, &e);
1349 while (b != e) {
1350 set_discard(cache, b);
1351 b = to_dblock(from_dblock(b) + 1);
1352 }
1353
1354 bio_endio(bio);
1355 cell_defer(cache, mg->new_ocell, false);
1356 free_migration(mg);
1357 wake_worker(cache);
1358} 1369}
1359 1370
1360static void issue_copy_or_discard(struct dm_cache_migration *mg) 1371static void mg_update_metadata(struct work_struct *ws)
1361{ 1372{
1362 bool avoid; 1373 int r;
1374 struct dm_cache_migration *mg = ws_to_mg(ws);
1363 struct cache *cache = mg->cache; 1375 struct cache *cache = mg->cache;
1376 struct policy_work *op = mg->op;
1364 1377
1365 if (mg->discard) { 1378 switch (op->op) {
1366 issue_discard(mg); 1379 case POLICY_PROMOTE:
1367 return; 1380 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1368 } 1381 if (r) {
1382 DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1383 cache_device_name(cache));
1384 metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1369 1385
1370 if (mg->writeback || mg->demote) 1386 mg_complete(mg, false);
1371 avoid = !is_dirty(cache, mg->cblock) || 1387 return;
1372 is_discarded_oblock(cache, mg->old_oblock); 1388 }
1373 else { 1389 mg_complete(mg, true);
1374 struct bio *bio = mg->new_ocell->holder; 1390 break;
1375 1391
1376 avoid = is_discarded_oblock(cache, mg->new_oblock); 1392 case POLICY_DEMOTE:
1393 r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1394 if (r) {
1395 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1396 cache_device_name(cache));
1397 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1377 1398
1378 if (writeback_mode(&cache->features) && 1399 mg_complete(mg, false);
1379 !avoid && bio_writes_complete_block(cache, bio)) {
1380 issue_overwrite(mg, bio);
1381 return; 1400 return;
1382 } 1401 }
1383 }
1384 1402
1385 avoid ? avoid_copy(mg) : issue_copy(mg); 1403 /*
1404 * It would be nice if we only had to commit when a REQ_FLUSH
1405 * comes through. But there's one scenario that we have to
1406 * look out for:
1407 *
1408 * - vblock x in a cache block
1409 * - demotion occurs
1410 * - cache block gets reallocated and overwritten
1411 * - crash
1412 *
1413 * When we recover, because there was no commit the cache will
1414 * roll back to having the data for vblock x in the cache block.
1415 * But the cache block has since been overwritten, so it'll end
1416 * up pointing to data that was never in 'x' during the history
1417 * of the device.
1418 *
1419 * To avoid this issue we require a commit as part of the
1420 * demotion operation.
1421 */
1422 init_continuation(&mg->k, mg_success);
1423 continue_after_commit(&cache->committer, &mg->k);
1424 schedule_commit(&cache->committer);
1425 break;
1426
1427 case POLICY_WRITEBACK:
1428 mg_complete(mg, true);
1429 break;
1430 }
1386} 1431}
1387 1432
1388static void complete_migration(struct dm_cache_migration *mg) 1433static void mg_update_metadata_after_copy(struct work_struct *ws)
1389{ 1434{
1390 if (mg->err) 1435 struct dm_cache_migration *mg = ws_to_mg(ws);
1391 migration_failure(mg); 1436
1437 /*
1438 * Did the copy succeed?
1439 */
1440 if (mg->k.input)
1441 mg_complete(mg, false);
1392 else 1442 else
1393 migration_success_pre_commit(mg); 1443 mg_update_metadata(ws);
1394} 1444}
1395 1445
1396static void process_migrations(struct cache *cache, struct list_head *head, 1446static void mg_upgrade_lock(struct work_struct *ws)
1397 void (*fn)(struct dm_cache_migration *))
1398{ 1447{
1399 unsigned long flags; 1448 int r;
1400 struct list_head list; 1449 struct dm_cache_migration *mg = ws_to_mg(ws);
1401 struct dm_cache_migration *mg, *tmp;
1402 1450
1403 INIT_LIST_HEAD(&list); 1451 /*
1404 spin_lock_irqsave(&cache->lock, flags); 1452 * Did the copy succeed?
1405 list_splice_init(head, &list); 1453 */
1406 spin_unlock_irqrestore(&cache->lock, flags); 1454 if (mg->k.input)
1455 mg_complete(mg, false);
1407 1456
1408 list_for_each_entry_safe(mg, tmp, &list, list) 1457 else {
1409 fn(mg); 1458 /*
1410} 1459 * Now we want the lock to prevent both reads and writes.
1460 */
1461 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1462 READ_WRITE_LOCK_LEVEL);
1463 if (r < 0)
1464 mg_complete(mg, false);
1411 1465
1412static void __queue_quiesced_migration(struct dm_cache_migration *mg) 1466 else if (r)
1413{ 1467 quiesce(mg, mg_update_metadata);
1414 list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 1468
1469 else
1470 mg_update_metadata(ws);
1471 }
1415} 1472}
1416 1473
1417static void queue_quiesced_migration(struct dm_cache_migration *mg) 1474static void mg_copy(struct work_struct *ws)
1418{ 1475{
1419 unsigned long flags; 1476 int r;
1420 struct cache *cache = mg->cache; 1477 struct dm_cache_migration *mg = ws_to_mg(ws);
1421 1478
1422 spin_lock_irqsave(&cache->lock, flags); 1479 if (mg->overwrite_bio) {
1423 __queue_quiesced_migration(mg); 1480 /*
1424 spin_unlock_irqrestore(&cache->lock, flags); 1481 * It's safe to do this here, even though it's new data
1482 * because all IO has been locked out of the block.
1483 *
1484 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
1485 * so _not_ using mg_upgrade_lock() as continuation.
1486 */
1487 overwrite(mg, mg_update_metadata_after_copy);
1425 1488
1426 wake_worker(cache); 1489 } else {
1427} 1490 struct cache *cache = mg->cache;
1491 struct policy_work *op = mg->op;
1492 bool is_policy_promote = (op->op == POLICY_PROMOTE);
1428 1493
1429static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 1494 if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1430{ 1495 is_discarded_oblock(cache, op->oblock)) {
1431 unsigned long flags; 1496 mg_upgrade_lock(ws);
1432 struct dm_cache_migration *mg, *tmp; 1497 return;
1498 }
1433 1499
1434 spin_lock_irqsave(&cache->lock, flags); 1500 init_continuation(&mg->k, mg_upgrade_lock);
1435 list_for_each_entry_safe(mg, tmp, work, list)
1436 __queue_quiesced_migration(mg);
1437 spin_unlock_irqrestore(&cache->lock, flags);
1438 1501
1439 wake_worker(cache); 1502 r = copy(mg, is_policy_promote);
1503 if (r) {
1504 DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
1505 mg->k.input = -EIO;
1506 mg_complete(mg, false);
1507 }
1508 }
1440} 1509}
1441 1510
1442static void check_for_quiesced_migrations(struct cache *cache, 1511static int mg_lock_writes(struct dm_cache_migration *mg)
1443 struct per_bio_data *pb)
1444{ 1512{
1445 struct list_head work; 1513 int r;
1514 struct dm_cell_key_v2 key;
1515 struct cache *cache = mg->cache;
1516 struct dm_bio_prison_cell_v2 *prealloc;
1446 1517
1447 if (!pb->all_io_entry) 1518 prealloc = alloc_prison_cell(cache);
1448 return; 1519 if (!prealloc) {
1520 DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
1521 mg_complete(mg, false);
1522 return -ENOMEM;
1523 }
1524
1525 /*
1526 * Prevent writes to the block, but allow reads to continue.
1527 * Unless we're using an overwrite bio, in which case we lock
1528 * everything.
1529 */
1530 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1531 r = dm_cell_lock_v2(cache->prison, &key,
1532 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1533 prealloc, &mg->cell);
1534 if (r < 0) {
1535 free_prison_cell(cache, prealloc);
1536 mg_complete(mg, false);
1537 return r;
1538 }
1449 1539
1450 INIT_LIST_HEAD(&work); 1540 if (mg->cell != prealloc)
1451 dm_deferred_entry_dec(pb->all_io_entry, &work); 1541 free_prison_cell(cache, prealloc);
1452 1542
1453 if (!list_empty(&work)) 1543 if (r == 0)
1454 queue_quiesced_migrations(cache, &work); 1544 mg_copy(&mg->k.ws);
1455} 1545 else
1546 quiesce(mg, mg_copy);
1456 1547
1457static void quiesce_migration(struct dm_cache_migration *mg) 1548 return 0;
1458{
1459 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
1460 queue_quiesced_migration(mg);
1461} 1549}
1462 1550
1463static void promote(struct cache *cache, struct prealloc *structs, 1551static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1464 dm_oblock_t oblock, dm_cblock_t cblock,
1465 struct dm_bio_prison_cell *cell)
1466{ 1552{
1467 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1553 struct dm_cache_migration *mg;
1554
1555 if (!background_work_begin(cache)) {
1556 policy_complete_background_work(cache->policy, op, false);
1557 return -EPERM;
1558 }
1559
1560 mg = alloc_migration(cache);
1561 if (!mg) {
1562 policy_complete_background_work(cache->policy, op, false);
1563 background_work_end(cache);
1564 return -ENOMEM;
1565 }
1566
1567 memset(mg, 0, sizeof(*mg));
1468 1568
1469 mg->err = false;
1470 mg->discard = false;
1471 mg->writeback = false;
1472 mg->demote = false;
1473 mg->promote = true;
1474 mg->requeue_holder = true;
1475 mg->invalidate = false;
1476 mg->cache = cache; 1569 mg->cache = cache;
1477 mg->new_oblock = oblock; 1570 mg->op = op;
1478 mg->cblock = cblock; 1571 mg->overwrite_bio = bio;
1479 mg->old_ocell = NULL;
1480 mg->new_ocell = cell;
1481 mg->start_jiffies = jiffies;
1482 1572
1483 inc_io_migrations(cache); 1573 if (!bio)
1484 quiesce_migration(mg); 1574 inc_io_migrations(cache);
1575
1576 return mg_lock_writes(mg);
1485} 1577}
1486 1578
1487static void writeback(struct cache *cache, struct prealloc *structs, 1579/*----------------------------------------------------------------
1488 dm_oblock_t oblock, dm_cblock_t cblock, 1580 * invalidation processing
1489 struct dm_bio_prison_cell *cell) 1581 *--------------------------------------------------------------*/
1582
1583static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1490{ 1584{
1491 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1585 struct bio_list bios;
1586 struct cache *cache = mg->cache;
1492 1587
1493 mg->err = false; 1588 bio_list_init(&bios);
1494 mg->discard = false; 1589 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1495 mg->writeback = true; 1590 free_prison_cell(cache, mg->cell);
1496 mg->demote = false;
1497 mg->promote = false;
1498 mg->requeue_holder = true;
1499 mg->invalidate = false;
1500 mg->cache = cache;
1501 mg->old_oblock = oblock;
1502 mg->cblock = cblock;
1503 mg->old_ocell = cell;
1504 mg->new_ocell = NULL;
1505 mg->start_jiffies = jiffies;
1506
1507 inc_io_migrations(cache);
1508 quiesce_migration(mg);
1509}
1510
1511static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1512 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1513 dm_cblock_t cblock,
1514 struct dm_bio_prison_cell *old_ocell,
1515 struct dm_bio_prison_cell *new_ocell)
1516{
1517 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1518
1519 mg->err = false;
1520 mg->discard = false;
1521 mg->writeback = false;
1522 mg->demote = true;
1523 mg->promote = true;
1524 mg->requeue_holder = true;
1525 mg->invalidate = false;
1526 mg->cache = cache;
1527 mg->old_oblock = old_oblock;
1528 mg->new_oblock = new_oblock;
1529 mg->cblock = cblock;
1530 mg->old_ocell = old_ocell;
1531 mg->new_ocell = new_ocell;
1532 mg->start_jiffies = jiffies;
1533 1591
1534 inc_io_migrations(cache); 1592 if (!success && mg->overwrite_bio)
1535 quiesce_migration(mg); 1593 bio_io_error(mg->overwrite_bio);
1536}
1537 1594
1538/* 1595 free_migration(mg);
1539 * Invalidate a cache entry. No writeback occurs; any changes in the cache 1596 defer_bios(cache, &bios);
1540 * block are thrown away.
1541 */
1542static void invalidate(struct cache *cache, struct prealloc *structs,
1543 dm_oblock_t oblock, dm_cblock_t cblock,
1544 struct dm_bio_prison_cell *cell)
1545{
1546 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1547
1548 mg->err = false;
1549 mg->discard = false;
1550 mg->writeback = false;
1551 mg->demote = true;
1552 mg->promote = false;
1553 mg->requeue_holder = true;
1554 mg->invalidate = true;
1555 mg->cache = cache;
1556 mg->old_oblock = oblock;
1557 mg->cblock = cblock;
1558 mg->old_ocell = cell;
1559 mg->new_ocell = NULL;
1560 mg->start_jiffies = jiffies;
1561 1597
1562 inc_io_migrations(cache); 1598 background_work_end(cache);
1563 quiesce_migration(mg);
1564} 1599}
1565 1600
1566static void discard(struct cache *cache, struct prealloc *structs, 1601static void invalidate_completed(struct work_struct *ws)
1567 struct dm_bio_prison_cell *cell)
1568{ 1602{
1569 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1603 struct dm_cache_migration *mg = ws_to_mg(ws);
1604 invalidate_complete(mg, !mg->k.input);
1605}
1570 1606
1571 mg->err = false; 1607static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1572 mg->discard = true; 1608{
1573 mg->writeback = false; 1609 int r = policy_invalidate_mapping(cache->policy, cblock);
1574 mg->demote = false; 1610 if (!r) {
1575 mg->promote = false; 1611 r = dm_cache_remove_mapping(cache->cmd, cblock);
1576 mg->requeue_holder = false; 1612 if (r) {
1577 mg->invalidate = false; 1613 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1578 mg->cache = cache; 1614 cache_device_name(cache));
1579 mg->old_ocell = NULL; 1615 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1580 mg->new_ocell = cell; 1616 }
1581 mg->start_jiffies = jiffies; 1617
1618 } else if (r == -ENODATA) {
1619 /*
1620 * Harmless, already unmapped.
1621 */
1622 r = 0;
1623
1624 } else
1625 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1582 1626
1583 quiesce_migration(mg); 1627 return r;
1584} 1628}
1585 1629
1586/*---------------------------------------------------------------- 1630static void invalidate_remove(struct work_struct *ws)
1587 * bio processing
1588 *--------------------------------------------------------------*/
1589static void defer_bio(struct cache *cache, struct bio *bio)
1590{ 1631{
1591 unsigned long flags; 1632 int r;
1633 struct dm_cache_migration *mg = ws_to_mg(ws);
1634 struct cache *cache = mg->cache;
1592 1635
1593 spin_lock_irqsave(&cache->lock, flags); 1636 r = invalidate_cblock(cache, mg->invalidate_cblock);
1594 bio_list_add(&cache->deferred_bios, bio); 1637 if (r) {
1595 spin_unlock_irqrestore(&cache->lock, flags); 1638 invalidate_complete(mg, false);
1639 return;
1640 }
1596 1641
1597 wake_worker(cache); 1642 init_continuation(&mg->k, invalidate_completed);
1643 continue_after_commit(&cache->committer, &mg->k);
1644 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1645 mg->overwrite_bio = NULL;
1646 schedule_commit(&cache->committer);
1598} 1647}
1599 1648
1600static void process_flush_bio(struct cache *cache, struct bio *bio) 1649static int invalidate_lock(struct dm_cache_migration *mg)
1601{ 1650{
1602 size_t pb_data_size = get_per_bio_data_size(cache); 1651 int r;
1603 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1652 struct dm_cell_key_v2 key;
1653 struct cache *cache = mg->cache;
1654 struct dm_bio_prison_cell_v2 *prealloc;
1604 1655
1605 BUG_ON(bio->bi_iter.bi_size); 1656 prealloc = alloc_prison_cell(cache);
1606 if (!pb->req_nr) 1657 if (!prealloc) {
1607 remap_to_origin(cache, bio); 1658 invalidate_complete(mg, false);
1608 else 1659 return -ENOMEM;
1609 remap_to_cache(cache, bio, 0); 1660 }
1610 1661
1611 /* 1662 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1612 * REQ_PREFLUSH is not directed at any particular block so we don't 1663 r = dm_cell_lock_v2(cache->prison, &key,
1613 * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH 1664 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1614 * by dm-core. 1665 if (r < 0) {
1615 */ 1666 free_prison_cell(cache, prealloc);
1616 issue(cache, bio); 1667 invalidate_complete(mg, false);
1668 return r;
1669 }
1670
1671 if (mg->cell != prealloc)
1672 free_prison_cell(cache, prealloc);
1673
1674 if (r)
1675 quiesce(mg, invalidate_remove);
1676
1677 else {
1678 /*
1679 * We can't call invalidate_remove() directly here because we
1680 * might still be in request context.
1681 */
1682 init_continuation(&mg->k, invalidate_remove);
1683 queue_work(cache->wq, &mg->k.ws);
1684 }
1685
1686 return 0;
1617} 1687}
1618 1688
1619static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1689static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1620 struct bio *bio) 1690 dm_oblock_t oblock, struct bio *bio)
1621{ 1691{
1622 int r; 1692 struct dm_cache_migration *mg;
1623 dm_dblock_t b, e;
1624 struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1625 1693
1626 calc_discard_block_range(cache, bio, &b, &e); 1694 if (!background_work_begin(cache))
1627 if (b == e) { 1695 return -EPERM;
1628 bio_endio(bio); 1696
1629 return; 1697 mg = alloc_migration(cache);
1698 if (!mg) {
1699 background_work_end(cache);
1700 return -ENOMEM;
1630 } 1701 }
1631 1702
1632 cell_prealloc = prealloc_get_cell(structs); 1703 memset(mg, 0, sizeof(*mg));
1633 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
1634 (cell_free_fn) prealloc_put_cell,
1635 structs, &new_ocell);
1636 if (r > 0)
1637 return;
1638 1704
1639 discard(cache, structs, new_ocell); 1705 mg->cache = cache;
1706 mg->overwrite_bio = bio;
1707 mg->invalidate_cblock = cblock;
1708 mg->invalidate_oblock = oblock;
1709
1710 return invalidate_lock(mg);
1640} 1711}
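/*
 * Taken together, passthrough invalidation runs: invalidate_start() ->
 * invalidate_lock() (exclusive READ_WRITE_LOCK_LEVEL on the origin
 * block) -> invalidate_remove(), which drops the mapping from the
 * policy and the on-disk metadata and remaps the triggering write to
 * the origin -> a forced commit -> invalidate_completed() ->
 * invalidate_complete(), which unlocks the cell and defers any bios
 * that were queued against it.
 */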
1641 1712
1642static bool spare_migration_bandwidth(struct cache *cache) 1713/*----------------------------------------------------------------
1714 * bio processing
1715 *--------------------------------------------------------------*/
1716
1717enum busy {
1718 IDLE,
1719 MODERATE,
1720 BUSY
1721};
1722
1723static enum busy spare_migration_bandwidth(struct cache *cache)
1643{ 1724{
1725 bool idle = iot_idle_for(&cache->origin_tracker, HZ);
1644 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1726 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1645 cache->sectors_per_block; 1727 cache->sectors_per_block;
1646 return current_volume < cache->migration_threshold; 1728
1729 if (current_volume <= cache->migration_threshold)
1730 return idle ? IDLE : MODERATE;
1731 else
1732 return idle ? MODERATE : BUSY;
1647} 1733}
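spare_migration_bandwidth() now folds two inputs into a three-level answer: whether the origin has been idle for about a second (iot_idle_for(..., HZ)) and whether the volume of in-flight migration IO is still within migration_threshold. A minimal standalone sketch of just that decision (hypothetical helper, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum busy { IDLE, MODERATE, BUSY };

/* idle origin + spare bandwidth -> IDLE; neither -> BUSY; otherwise MODERATE */
static enum busy busy_level(bool origin_idle, bool under_threshold)
{
	if (under_threshold)
		return origin_idle ? IDLE : MODERATE;
	return origin_idle ? MODERATE : BUSY;
}

int main(void)
{
	printf("%d %d %d\n",
	       busy_level(true, true),		/* IDLE (0) */
	       busy_level(true, false),		/* MODERATE (1) */
	       busy_level(false, false));	/* BUSY (2) */
	return 0;
}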
1648 1734
1649static void inc_hit_counter(struct cache *cache, struct bio *bio) 1735static void inc_hit_counter(struct cache *cache, struct bio *bio)
@@ -1660,255 +1746,143 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
1660 1746
1661/*----------------------------------------------------------------*/ 1747/*----------------------------------------------------------------*/
1662 1748
1663struct inc_detail { 1749static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1664 struct cache *cache;
1665 struct bio_list bios_for_issue;
1666 struct bio_list unhandled_bios;
1667 bool any_writes;
1668};
1669
1670static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
1671{ 1750{
1672 struct bio *bio; 1751 return (bio_data_dir(bio) == WRITE) &&
1673 struct inc_detail *detail = context; 1752 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1674 struct cache *cache = detail->cache;
1675
1676 inc_ds(cache, cell->holder, cell);
1677 if (bio_data_dir(cell->holder) == WRITE)
1678 detail->any_writes = true;
1679
1680 while ((bio = bio_list_pop(&cell->bios))) {
1681 if (discard_or_flush(bio)) {
1682 bio_list_add(&detail->unhandled_bios, bio);
1683 continue;
1684 }
1685
1686 if (bio_data_dir(bio) == WRITE)
1687 detail->any_writes = true;
1688
1689 bio_list_add(&detail->bios_for_issue, bio);
1690 inc_ds(cache, bio, cell);
1691 }
1692} 1753}
1693 1754
1694// FIXME: refactor these two 1755static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1695static void remap_cell_to_origin_clear_discard(struct cache *cache,
1696 struct dm_bio_prison_cell *cell,
1697 dm_oblock_t oblock, bool issue_holder)
1698{ 1756{
1699 struct bio *bio; 1757 return writeback_mode(&cache->features) &&
1700 unsigned long flags; 1758 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1701 struct inc_detail detail;
1702
1703 detail.cache = cache;
1704 bio_list_init(&detail.bios_for_issue);
1705 bio_list_init(&detail.unhandled_bios);
1706 detail.any_writes = false;
1707
1708 spin_lock_irqsave(&cache->lock, flags);
1709 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1710 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1711 spin_unlock_irqrestore(&cache->lock, flags);
1712
1713 remap_to_origin(cache, cell->holder);
1714 if (issue_holder)
1715 issue(cache, cell->holder);
1716 else
1717 accounted_begin(cache, cell->holder);
1718
1719 if (detail.any_writes)
1720 clear_discard(cache, oblock_to_dblock(cache, oblock));
1721
1722 while ((bio = bio_list_pop(&detail.bios_for_issue))) {
1723 remap_to_origin(cache, bio);
1724 issue(cache, bio);
1725 }
1726
1727 free_prison_cell(cache, cell);
1728} 1759}
1729 1760
1730static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, 1761static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1731 dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) 1762 bool *commit_needed)
1732{ 1763{
1733 struct bio *bio; 1764 int r, data_dir;
1734 unsigned long flags; 1765 bool rb, background_queued;
1735 struct inc_detail detail; 1766 dm_cblock_t cblock;
1736 1767 size_t pb_data_size = get_per_bio_data_size(cache);
1737 detail.cache = cache; 1768 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1738 bio_list_init(&detail.bios_for_issue);
1739 bio_list_init(&detail.unhandled_bios);
1740 detail.any_writes = false;
1741
1742 spin_lock_irqsave(&cache->lock, flags);
1743 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
1744 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
1745 spin_unlock_irqrestore(&cache->lock, flags);
1746 1769
1747 remap_to_cache(cache, cell->holder, cblock); 1770 *commit_needed = false;
1748 if (issue_holder)
1749 issue(cache, cell->holder);
1750 else
1751 accounted_begin(cache, cell->holder);
1752 1771
1753 if (detail.any_writes) { 1772 rb = bio_detain_shared(cache, block, bio);
1754 set_dirty(cache, oblock, cblock); 1773 if (!rb) {
1755 clear_discard(cache, oblock_to_dblock(cache, oblock)); 1774 /*
1756 } 1775 * An exclusive lock is held for this block, so we have to
1757 1776 * wait. We set the commit_needed flag so the current
1758 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1777 * transaction will be committed asap, allowing this lock
1759 remap_to_cache(cache, bio, cblock); 1778 * to be dropped.
1760 issue(cache, bio); 1779 */
1780 *commit_needed = true;
1781 return DM_MAPIO_SUBMITTED;
1761 } 1782 }
1762 1783
1763 free_prison_cell(cache, cell); 1784 data_dir = bio_data_dir(bio);
1764}
1765 1785
1766/*----------------------------------------------------------------*/ 1786 if (optimisable_bio(cache, bio, block)) {
1767 1787 struct policy_work *op = NULL;
1768struct old_oblock_lock {
1769 struct policy_locker locker;
1770 struct cache *cache;
1771 struct prealloc *structs;
1772 struct dm_bio_prison_cell *cell;
1773};
1774 1788
1775static int null_locker(struct policy_locker *locker, dm_oblock_t b) 1789 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1776{ 1790 if (unlikely(r && r != -ENOENT)) {
1777 /* This should never be called */ 1791 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1778 BUG(); 1792 cache_device_name(cache), r);
1779 return 0; 1793 bio_io_error(bio);
1780} 1794 return DM_MAPIO_SUBMITTED;
1795 }
1781 1796
1782static int cell_locker(struct policy_locker *locker, dm_oblock_t b) 1797 if (r == -ENOENT && op) {
1783{ 1798 bio_drop_shared_lock(cache, bio);
1784 struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); 1799 BUG_ON(op->op != POLICY_PROMOTE);
1785 struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); 1800 mg_start(cache, op, bio);
1801 return DM_MAPIO_SUBMITTED;
1802 }
1803 } else {
1804 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1805 if (unlikely(r && r != -ENOENT)) {
1806 DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1807 cache_device_name(cache), r);
1808 bio_io_error(bio);
1809 return DM_MAPIO_SUBMITTED;
1810 }
1786 1811
1787 return bio_detain(l->cache, b, NULL, cell_prealloc, 1812 if (background_queued)
1788 (cell_free_fn) prealloc_put_cell, 1813 wake_migration_worker(cache);
1789 l->structs, &l->cell); 1814 }
1790}
1791 1815
1792static void process_cell(struct cache *cache, struct prealloc *structs, 1816 if (r == -ENOENT) {
1793 struct dm_bio_prison_cell *new_ocell) 1817 /*
1794{ 1818 * Miss.
1795 int r; 1819 */
1796 bool release_cell = true; 1820 inc_miss_counter(cache, bio);
1797 struct bio *bio = new_ocell->holder; 1821 if (pb->req_nr == 0) {
1798 dm_oblock_t block = get_bio_block(cache, bio); 1822 accounted_begin(cache, bio);
1799 struct policy_result lookup_result; 1823 remap_to_origin_clear_discard(cache, bio, block);
1800 bool passthrough = passthrough_mode(&cache->features);
1801 bool fast_promotion, can_migrate;
1802 struct old_oblock_lock ool;
1803
1804 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
1805 can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
1806
1807 ool.locker.fn = cell_locker;
1808 ool.cache = cache;
1809 ool.structs = structs;
1810 ool.cell = NULL;
1811 r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
1812 bio, &ool.locker, &lookup_result);
1813
1814 if (r == -EWOULDBLOCK)
1815 /* migration has been denied */
1816 lookup_result.op = POLICY_MISS;
1817
1818 switch (lookup_result.op) {
1819 case POLICY_HIT:
1820 if (passthrough) {
1821 inc_miss_counter(cache, bio);
1822 1824
1825 } else {
1823 /* 1826 /*
1824 * Passthrough always maps to the origin, 1827 * This is a duplicate writethrough io that is no
1825 * invalidating any cache blocks that are written 1828 * longer needed because the block has been demoted.
1826 * to.
1827 */ 1829 */
1830 bio_endio(bio);
1831 return DM_MAPIO_SUBMITTED;
1832 }
1833 } else {
1834 /*
1835 * Hit.
1836 */
1837 inc_hit_counter(cache, bio);
1828 1838
1839 /*
1840 * Passthrough always maps to the origin, invalidating any
1841 * cache blocks that are written to.
1842 */
1843 if (passthrough_mode(&cache->features)) {
1829 if (bio_data_dir(bio) == WRITE) { 1844 if (bio_data_dir(bio) == WRITE) {
1845 bio_drop_shared_lock(cache, bio);
1830 atomic_inc(&cache->stats.demotion); 1846 atomic_inc(&cache->stats.demotion);
1831 invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1847 invalidate_start(cache, cblock, block, bio);
1832 release_cell = false; 1848 } else
1833
1834 } else {
1835 /* FIXME: factor out issue_origin() */
1836 remap_to_origin_clear_discard(cache, bio, block); 1849 remap_to_origin_clear_discard(cache, bio, block);
1837 inc_and_issue(cache, bio, new_ocell); 1850
1838 }
1839 } else { 1851 } else {
1840 inc_hit_counter(cache, bio); 1852 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
1841 1853 !is_dirty(cache, cblock)) {
1842 if (bio_data_dir(bio) == WRITE && 1854 remap_to_origin_then_cache(cache, bio, block, cblock);
1843 writethrough_mode(&cache->features) && 1855 accounted_begin(cache, bio);
1844 !is_dirty(cache, lookup_result.cblock)) { 1856 } else
1845 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1857 remap_to_cache_dirty(cache, bio, block, cblock);
1846 inc_and_issue(cache, bio, new_ocell);
1847
1848 } else {
1849 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
1850 release_cell = false;
1851 }
1852 } 1858 }
1853
1854 break;
1855
1856 case POLICY_MISS:
1857 inc_miss_counter(cache, bio);
1858 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
1859 release_cell = false;
1860 break;
1861
1862 case POLICY_NEW:
1863 atomic_inc(&cache->stats.promotion);
1864 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1865 release_cell = false;
1866 break;
1867
1868 case POLICY_REPLACE:
1869 atomic_inc(&cache->stats.demotion);
1870 atomic_inc(&cache->stats.promotion);
1871 demote_then_promote(cache, structs, lookup_result.old_oblock,
1872 block, lookup_result.cblock,
1873 ool.cell, new_ocell);
1874 release_cell = false;
1875 break;
1876
1877 default:
1878 DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
1879 cache_device_name(cache), __func__,
1880 (unsigned) lookup_result.op);
1881 bio_io_error(bio);
1882 } 1859 }
1883 1860
1884 if (release_cell)
1885 cell_defer(cache, new_ocell, false);
1886}
1887
1888static void process_bio(struct cache *cache, struct prealloc *structs,
1889 struct bio *bio)
1890{
1891 int r;
1892 dm_oblock_t block = get_bio_block(cache, bio);
1893 struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
1894
1895 /* 1861 /*
1896 * Check to see if that block is currently migrating. 1862 * dm core turns FUA requests into a separate payload and FLUSH req.
1897 */ 1863 */
1898 cell_prealloc = prealloc_get_cell(structs); 1864 if (bio->bi_opf & REQ_FUA) {
1899 r = bio_detain(cache, block, bio, cell_prealloc, 1865 /*
1900 (cell_free_fn) prealloc_put_cell, 1866 * issue_after_commit will call accounted_begin a second time. So
1901 structs, &new_ocell); 1867 * we call accounted_complete() to avoid double accounting.
1902 if (r > 0) 1868 */
1903 return; 1869 accounted_complete(cache, bio);
1870 issue_after_commit(&cache->committer, bio);
1871 *commit_needed = true;
1872 return DM_MAPIO_SUBMITTED;
1873 }
1904 1874
1905 process_cell(cache, structs, new_ocell); 1875 return DM_MAPIO_REMAPPED;
1906} 1876}
1907 1877
1908static int need_commit_due_to_time(struct cache *cache) 1878static bool process_bio(struct cache *cache, struct bio *bio)
1909{ 1879{
1910 return jiffies < cache->last_commit_jiffies || 1880 bool commit_needed;
1911 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1881
1882 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1883 generic_make_request(bio);
1884
1885 return commit_needed;
1912} 1886}
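/*
 * map_bio() in outline: take a shared lock with bio_detain_shared(); if
 * an exclusive lock already covers the block, leave the bio in the cell
 * and request a commit so the lock can be dropped. Otherwise consult
 * the policy (policy_lookup_with_work() for bios that may be promoted
 * by overwrite, policy_lookup() otherwise) and remap: a miss goes to
 * the origin, a write in passthrough mode kicks off invalidate_start(),
 * a writethrough write to a clean block uses
 * remap_to_origin_then_cache(), and everything else maps to the cache.
 * FUA bios are routed through the committer via issue_after_commit().
 */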
1913 1887
1914/* 1888/*
@@ -1929,123 +1903,88 @@ static int commit(struct cache *cache, bool clean_shutdown)
1929 return r; 1903 return r;
1930} 1904}
1931 1905
1932static int commit_if_needed(struct cache *cache) 1906/*
1907 * Used by the batcher.
1908 */
1909static int commit_op(void *context)
1933{ 1910{
1934 int r = 0; 1911 struct cache *cache = context;
1935 1912
1936 if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1913 if (dm_cache_changed_this_transaction(cache->cmd))
1937 dm_cache_changed_this_transaction(cache->cmd)) { 1914 return commit(cache, false);
1938 r = commit(cache, false);
1939 cache->commit_requested = false;
1940 cache->last_commit_jiffies = jiffies;
1941 }
1942 1915
1943 return r; 1916 return 0;
1944} 1917}
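/*
 * commit_op() is the callback handed to the 'committer' batcher: as
 * used in this patch, schedule_commit() asks the batcher to run it
 * soon, while bios queued with issue_after_commit() and continuations
 * registered with continue_after_commit() are only released once that
 * commit has completed.
 */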
1945 1918
1946static void process_deferred_bios(struct cache *cache) 1919/*----------------------------------------------------------------*/
1947{
1948 bool prealloc_used = false;
1949 unsigned long flags;
1950 struct bio_list bios;
1951 struct bio *bio;
1952 struct prealloc structs;
1953
1954 memset(&structs, 0, sizeof(structs));
1955 bio_list_init(&bios);
1956
1957 spin_lock_irqsave(&cache->lock, flags);
1958 bio_list_merge(&bios, &cache->deferred_bios);
1959 bio_list_init(&cache->deferred_bios);
1960 spin_unlock_irqrestore(&cache->lock, flags);
1961
1962 while (!bio_list_empty(&bios)) {
1963 /*
1964 * If we've got no free migration structs, and processing
1965 * this bio might require one, we pause until there are some
1966 * prepared mappings to process.
1967 */
1968 prealloc_used = true;
1969 if (prealloc_data_structs(cache, &structs)) {
1970 spin_lock_irqsave(&cache->lock, flags);
1971 bio_list_merge(&cache->deferred_bios, &bios);
1972 spin_unlock_irqrestore(&cache->lock, flags);
1973 break;
1974 }
1975 1920
1976 bio = bio_list_pop(&bios); 1921static bool process_flush_bio(struct cache *cache, struct bio *bio)
1922{
1923 size_t pb_data_size = get_per_bio_data_size(cache);
1924 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1977 1925
1978 if (bio->bi_opf & REQ_PREFLUSH) 1926 if (!pb->req_nr)
1979 process_flush_bio(cache, bio); 1927 remap_to_origin(cache, bio);
1980 else if (bio_op(bio) == REQ_OP_DISCARD) 1928 else
1981 process_discard_bio(cache, &structs, bio); 1929 remap_to_cache(cache, bio, 0);
1982 else
1983 process_bio(cache, &structs, bio);
1984 }
1985 1930
1986 if (prealloc_used) 1931 issue_after_commit(&cache->committer, bio);
1987 prealloc_free_structs(cache, &structs); 1932 return true;
1988} 1933}
1989 1934
1990static void process_deferred_cells(struct cache *cache) 1935static bool process_discard_bio(struct cache *cache, struct bio *bio)
1991{ 1936{
1992 bool prealloc_used = false; 1937 dm_dblock_t b, e;
1993 unsigned long flags;
1994 struct dm_bio_prison_cell *cell, *tmp;
1995 struct list_head cells;
1996 struct prealloc structs;
1997
1998 memset(&structs, 0, sizeof(structs));
1999
2000 INIT_LIST_HEAD(&cells);
2001
2002 spin_lock_irqsave(&cache->lock, flags);
2003 list_splice_init(&cache->deferred_cells, &cells);
2004 spin_unlock_irqrestore(&cache->lock, flags);
2005
2006 list_for_each_entry_safe(cell, tmp, &cells, user_list) {
2007 /*
2008 * If we've got no free migration structs, and processing
2009 * this bio might require one, we pause until there are some
2010 * prepared mappings to process.
2011 */
2012 prealloc_used = true;
2013 if (prealloc_data_structs(cache, &structs)) {
2014 spin_lock_irqsave(&cache->lock, flags);
2015 list_splice(&cells, &cache->deferred_cells);
2016 spin_unlock_irqrestore(&cache->lock, flags);
2017 break;
2018 }
2019 1938
2020 process_cell(cache, &structs, cell); 1939 // FIXME: do we need to lock the region? Or can we just assume the
1940 // user won't be so foolish as to issue discard concurrently with
1941 // other IO?
1942 calc_discard_block_range(cache, bio, &b, &e);
1943 while (b != e) {
1944 set_discard(cache, b);
1945 b = to_dblock(from_dblock(b) + 1);
2021 } 1946 }
2022 1947
2023 if (prealloc_used) 1948 bio_endio(bio);
2024 prealloc_free_structs(cache, &structs); 1949
1950 return false;
2025} 1951}
2026 1952
2027static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 1953static void process_deferred_bios(struct work_struct *ws)
2028{ 1954{
1955 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1956
2029 unsigned long flags; 1957 unsigned long flags;
1958 bool commit_needed = false;
2030 struct bio_list bios; 1959 struct bio_list bios;
2031 struct bio *bio; 1960 struct bio *bio;
2032 1961
2033 bio_list_init(&bios); 1962 bio_list_init(&bios);
2034 1963
2035 spin_lock_irqsave(&cache->lock, flags); 1964 spin_lock_irqsave(&cache->lock, flags);
2036 bio_list_merge(&bios, &cache->deferred_flush_bios); 1965 bio_list_merge(&bios, &cache->deferred_bios);
2037 bio_list_init(&cache->deferred_flush_bios); 1966 bio_list_init(&cache->deferred_bios);
2038 spin_unlock_irqrestore(&cache->lock, flags); 1967 spin_unlock_irqrestore(&cache->lock, flags);
2039 1968
2040 /* 1969 while ((bio = bio_list_pop(&bios))) {
2041 * These bios have already been through inc_ds() 1970 if (bio->bi_opf & REQ_PREFLUSH)
2042 */ 1971 commit_needed = process_flush_bio(cache, bio) || commit_needed;
2043 while ((bio = bio_list_pop(&bios))) 1972
2044 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 1973 else if (bio_op(bio) == REQ_OP_DISCARD)
1974 commit_needed = process_discard_bio(cache, bio) || commit_needed;
1975
1976 else
1977 commit_needed = process_bio(cache, bio) || commit_needed;
1978 }
1979
1980 if (commit_needed)
1981 schedule_commit(&cache->committer);
2045} 1982}
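The bios drained by the worker above are parked on cache->deferred_bios by a small deferral helper elsewhere in this patch. A minimal sketch of that path, using the field names from this hunk but with the details simplified:

	static void wake_deferred_bio_worker(struct cache *cache)
	{
		queue_work(cache->wq, &cache->deferred_bio_worker);
	}

	static void defer_bio(struct cache *cache, struct bio *bio)
	{
		unsigned long flags;

		/* Park the bio on the deferred list under the cache lock... */
		spin_lock_irqsave(&cache->lock, flags);
		bio_list_add(&cache->deferred_bios, bio);
		spin_unlock_irqrestore(&cache->lock, flags);

		/* ...then let process_deferred_bios() drain it from the workqueue. */
		wake_deferred_bio_worker(cache);
	}

This replaces the old model where everything funnelled through the single do_worker() loop; each class of deferred work now has its own work item on the shared workqueue.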
2046 1983
2047static void process_deferred_writethrough_bios(struct cache *cache) 1984static void process_deferred_writethrough_bios(struct work_struct *ws)
2048{ 1985{
1986 struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
1987
2049 unsigned long flags; 1988 unsigned long flags;
2050 struct bio_list bios; 1989 struct bio_list bios;
2051 struct bio *bio; 1990 struct bio *bio;
@@ -2058,153 +1997,15 @@ static void process_deferred_writethrough_bios(struct cache *cache)
2058 spin_unlock_irqrestore(&cache->lock, flags); 1997 spin_unlock_irqrestore(&cache->lock, flags);
2059 1998
2060 /* 1999 /*
2061 * These bios have already been through inc_ds() 2000 * These bios have already been through accounted_begin()
2062 */ 2001 */
2063 while ((bio = bio_list_pop(&bios))) 2002 while ((bio = bio_list_pop(&bios)))
2064 accounted_request(cache, bio); 2003 generic_make_request(bio);
2065}
2066
2067static void writeback_some_dirty_blocks(struct cache *cache)
2068{
2069 bool prealloc_used = false;
2070 dm_oblock_t oblock;
2071 dm_cblock_t cblock;
2072 struct prealloc structs;
2073 struct dm_bio_prison_cell *old_ocell;
2074 bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
2075
2076 memset(&structs, 0, sizeof(structs));
2077
2078 while (spare_migration_bandwidth(cache)) {
2079 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
2080 break; /* no work to do */
2081
2082 prealloc_used = true;
2083 if (prealloc_data_structs(cache, &structs) ||
2084 get_cell(cache, oblock, &structs, &old_ocell)) {
2085 policy_set_dirty(cache->policy, oblock);
2086 break;
2087 }
2088
2089 writeback(cache, &structs, oblock, cblock, old_ocell);
2090 }
2091
2092 if (prealloc_used)
2093 prealloc_free_structs(cache, &structs);
2094}
2095
2096/*----------------------------------------------------------------
2097 * Invalidations.
2098 * Dropping something from the cache *without* writing back.
2099 *--------------------------------------------------------------*/
2100
2101static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
2102{
2103 int r = 0;
2104 uint64_t begin = from_cblock(req->cblocks->begin);
2105 uint64_t end = from_cblock(req->cblocks->end);
2106
2107 while (begin != end) {
2108 r = policy_remove_cblock(cache->policy, to_cblock(begin));
2109 if (!r) {
2110 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
2111 if (r) {
2112 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
2113 break;
2114 }
2115
2116 } else if (r == -ENODATA) {
2117 /* harmless, already unmapped */
2118 r = 0;
2119
2120 } else {
2121 DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
2122 break;
2123 }
2124
2125 begin++;
2126 }
2127
2128 cache->commit_requested = true;
2129
2130 req->err = r;
2131 atomic_set(&req->complete, 1);
2132
2133 wake_up(&req->result_wait);
2134}
2135
2136static void process_invalidation_requests(struct cache *cache)
2137{
2138 struct list_head list;
2139 struct invalidation_request *req, *tmp;
2140
2141 INIT_LIST_HEAD(&list);
2142 spin_lock(&cache->invalidation_lock);
2143 list_splice_init(&cache->invalidation_requests, &list);
2144 spin_unlock(&cache->invalidation_lock);
2145
2146 list_for_each_entry_safe (req, tmp, &list, list)
2147 process_invalidation_request(cache, req);
2148} 2004}
2149 2005
2150/*---------------------------------------------------------------- 2006/*----------------------------------------------------------------
2151 * Main worker loop 2007 * Main worker loop
2152 *--------------------------------------------------------------*/ 2008 *--------------------------------------------------------------*/
2153static bool is_quiescing(struct cache *cache)
2154{
2155 return atomic_read(&cache->quiescing);
2156}
2157
2158static void ack_quiescing(struct cache *cache)
2159{
2160 if (is_quiescing(cache)) {
2161 atomic_inc(&cache->quiescing_ack);
2162 wake_up(&cache->quiescing_wait);
2163 }
2164}
2165
2166static void wait_for_quiescing_ack(struct cache *cache)
2167{
2168 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
2169}
2170
2171static void start_quiescing(struct cache *cache)
2172{
2173 atomic_inc(&cache->quiescing);
2174 wait_for_quiescing_ack(cache);
2175}
2176
2177static void stop_quiescing(struct cache *cache)
2178{
2179 atomic_set(&cache->quiescing, 0);
2180 atomic_set(&cache->quiescing_ack, 0);
2181}
2182
2183static void wait_for_migrations(struct cache *cache)
2184{
2185 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
2186}
2187
2188static void stop_worker(struct cache *cache)
2189{
2190 cancel_delayed_work(&cache->waker);
2191 flush_workqueue(cache->wq);
2192}
2193
2194static void requeue_deferred_cells(struct cache *cache)
2195{
2196 unsigned long flags;
2197 struct list_head cells;
2198 struct dm_bio_prison_cell *cell, *tmp;
2199
2200 INIT_LIST_HEAD(&cells);
2201 spin_lock_irqsave(&cache->lock, flags);
2202 list_splice_init(&cache->deferred_cells, &cells);
2203 spin_unlock_irqrestore(&cache->lock, flags);
2204
2205 list_for_each_entry_safe(cell, tmp, &cells, user_list)
2206 cell_requeue(cache, cell);
2207}
2208 2009
2209static void requeue_deferred_bios(struct cache *cache) 2010static void requeue_deferred_bios(struct cache *cache)
2210{ 2011{
@@ -2221,53 +2022,6 @@ static void requeue_deferred_bios(struct cache *cache)
2221 } 2022 }
2222} 2023}
2223 2024
2224static int more_work(struct cache *cache)
2225{
2226 if (is_quiescing(cache))
2227 return !list_empty(&cache->quiesced_migrations) ||
2228 !list_empty(&cache->completed_migrations) ||
2229 !list_empty(&cache->need_commit_migrations);
2230 else
2231 return !bio_list_empty(&cache->deferred_bios) ||
2232 !list_empty(&cache->deferred_cells) ||
2233 !bio_list_empty(&cache->deferred_flush_bios) ||
2234 !bio_list_empty(&cache->deferred_writethrough_bios) ||
2235 !list_empty(&cache->quiesced_migrations) ||
2236 !list_empty(&cache->completed_migrations) ||
2237 !list_empty(&cache->need_commit_migrations) ||
2238 cache->invalidate;
2239}
2240
2241static void do_worker(struct work_struct *ws)
2242{
2243 struct cache *cache = container_of(ws, struct cache, worker);
2244
2245 do {
2246 if (!is_quiescing(cache)) {
2247 writeback_some_dirty_blocks(cache);
2248 process_deferred_writethrough_bios(cache);
2249 process_deferred_bios(cache);
2250 process_deferred_cells(cache);
2251 process_invalidation_requests(cache);
2252 }
2253
2254 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
2255 process_migrations(cache, &cache->completed_migrations, complete_migration);
2256
2257 if (commit_if_needed(cache)) {
2258 process_deferred_flush_bios(cache, false);
2259 process_migrations(cache, &cache->need_commit_migrations, migration_failure);
2260 } else {
2261 process_deferred_flush_bios(cache, true);
2262 process_migrations(cache, &cache->need_commit_migrations,
2263 migration_success_post_commit);
2264 }
2265
2266 ack_quiescing(cache);
2267
2268 } while (more_work(cache));
2269}
2270
2271/* 2025/*
2272 * We want to commit periodically so that not too much 2026 * We want to commit periodically so that not too much
2273 * unwritten metadata builds up. 2027 * unwritten metadata builds up.
@@ -2275,25 +2029,39 @@ static void do_worker(struct work_struct *ws)
2275static void do_waker(struct work_struct *ws) 2029static void do_waker(struct work_struct *ws)
2276{ 2030{
2277 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2031 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
2032
2278 policy_tick(cache->policy, true); 2033 policy_tick(cache->policy, true);
2279 wake_worker(cache); 2034 wake_migration_worker(cache);
2035 schedule_commit(&cache->committer);
2280 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2036 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
2281} 2037}
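COMMIT_PERIOD itself is defined near the top of dm-cache-target.c, outside this hunk; in mainline it has been a second's worth of jiffies, i.e. roughly:

	/* Assumed value: how often the waker ticks the policy and schedules a commit. */
	#define COMMIT_PERIOD HZ

cache_resume() further down primes the cycle by calling do_waker() directly, after which the work re-queues itself every period.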
2282 2038
2283/*----------------------------------------------------------------*/ 2039static void check_migrations(struct work_struct *ws)
2284
2285static int is_congested(struct dm_dev *dev, int bdi_bits)
2286{ 2040{
2287 struct request_queue *q = bdev_get_queue(dev->bdev); 2041 int r;
2288 return bdi_congested(q->backing_dev_info, bdi_bits); 2042 struct policy_work *op;
2289} 2043 struct cache *cache = container_of(ws, struct cache, migration_worker);
2044 enum busy b;
2290 2045
2291static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2046 for (;;) {
2292{ 2047 b = spare_migration_bandwidth(cache);
2293 struct cache *cache = container_of(cb, struct cache, callbacks); 2048 if (b == BUSY)
2049 break;
2294 2050
2295 return is_congested(cache->origin_dev, bdi_bits) || 2051 r = policy_get_background_work(cache->policy, b == IDLE, &op);
2296 is_congested(cache->cache_dev, bdi_bits); 2052 if (r == -ENODATA)
2053 break;
2054
2055 if (r) {
2056 DMERR_LIMIT("%s: policy_get_background_work failed",
2057 cache_device_name(cache));
2058 break;
2059 }
2060
2061 r = mg_start(cache, op, NULL);
2062 if (r)
2063 break;
2064 }
2297} 2065}
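Other paths kick this loop through a one-line wrapper rather than calling it directly; a sketch of that wrapper (the in-tree helper may carry an extra guard, e.g. skipping the queueing in passthrough mode):

	static void wake_migration_worker(struct cache *cache)
	{
		/* Re-run the background-work loop above from the shared workqueue. */
		queue_work(cache->wq, &cache->migration_worker);
	}

Note that -ENODATA from policy_get_background_work() just means the policy has nothing queued, so the loop exits quietly; any other error is logged and also stops the loop.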
2298 2066
2299/*---------------------------------------------------------------- 2067/*----------------------------------------------------------------
@@ -2310,11 +2078,8 @@ static void destroy(struct cache *cache)
2310 2078
2311 mempool_destroy(cache->migration_pool); 2079 mempool_destroy(cache->migration_pool);
2312 2080
2313 if (cache->all_io_ds)
2314 dm_deferred_set_destroy(cache->all_io_ds);
2315
2316 if (cache->prison) 2081 if (cache->prison)
2317 dm_bio_prison_destroy(cache->prison); 2082 dm_bio_prison_destroy_v2(cache->prison);
2318 2083
2319 if (cache->wq) 2084 if (cache->wq)
2320 destroy_workqueue(cache->wq); 2085 destroy_workqueue(cache->wq);
@@ -2707,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2707 return PTR_ERR(p); 2472 return PTR_ERR(p);
2708 } 2473 }
2709 cache->policy = p; 2474 cache->policy = p;
2475 BUG_ON(!cache->policy);
2710 2476
2711 return 0; 2477 return 0;
2712} 2478}
@@ -2750,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size)
2750 cache->cache_size = size; 2516 cache->cache_size = size;
2751} 2517}
2752 2518
2519static int is_congested(struct dm_dev *dev, int bdi_bits)
2520{
2521 struct request_queue *q = bdev_get_queue(dev->bdev);
2522 return bdi_congested(q->backing_dev_info, bdi_bits);
2523}
2524
2525static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2526{
2527 struct cache *cache = container_of(cb, struct cache, callbacks);
2528
2529 return is_congested(cache->origin_dev, bdi_bits) ||
2530 is_congested(cache->cache_dev, bdi_bits);
2531}
2532
2753#define DEFAULT_MIGRATION_THRESHOLD 2048 2533#define DEFAULT_MIGRATION_THRESHOLD 2048
2754 2534
2755static int cache_create(struct cache_args *ca, struct cache **result) 2535static int cache_create(struct cache_args *ca, struct cache **result)
@@ -2788,7 +2568,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2788 2568
2789 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2569 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2790 2570
2791 /* FIXME: factor out this whole section */
2792 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2571 origin_blocks = cache->origin_sectors = ca->origin_sectors;
2793 origin_blocks = block_div(origin_blocks, ca->block_size); 2572 origin_blocks = block_div(origin_blocks, ca->block_size);
2794 cache->origin_blocks = to_oblock(origin_blocks); 2573 cache->origin_blocks = to_oblock(origin_blocks);
@@ -2854,24 +2633,18 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2854 r = -EINVAL; 2633 r = -EINVAL;
2855 goto bad; 2634 goto bad;
2856 } 2635 }
2636
2637 policy_allow_migrations(cache->policy, false);
2857 } 2638 }
2858 2639
2859 spin_lock_init(&cache->lock); 2640 spin_lock_init(&cache->lock);
2860 INIT_LIST_HEAD(&cache->deferred_cells); 2641 INIT_LIST_HEAD(&cache->deferred_cells);
2861 bio_list_init(&cache->deferred_bios); 2642 bio_list_init(&cache->deferred_bios);
2862 bio_list_init(&cache->deferred_flush_bios);
2863 bio_list_init(&cache->deferred_writethrough_bios); 2643 bio_list_init(&cache->deferred_writethrough_bios);
2864 INIT_LIST_HEAD(&cache->quiesced_migrations);
2865 INIT_LIST_HEAD(&cache->completed_migrations);
2866 INIT_LIST_HEAD(&cache->need_commit_migrations);
2867 atomic_set(&cache->nr_allocated_migrations, 0); 2644 atomic_set(&cache->nr_allocated_migrations, 0);
2868 atomic_set(&cache->nr_io_migrations, 0); 2645 atomic_set(&cache->nr_io_migrations, 0);
2869 init_waitqueue_head(&cache->migration_wait); 2646 init_waitqueue_head(&cache->migration_wait);
2870 2647
2871 init_waitqueue_head(&cache->quiescing_wait);
2872 atomic_set(&cache->quiescing, 0);
2873 atomic_set(&cache->quiescing_ack, 0);
2874
2875 r = -ENOMEM; 2648 r = -ENOMEM;
2876 atomic_set(&cache->nr_dirty, 0); 2649 atomic_set(&cache->nr_dirty, 0);
2877 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2650 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
@@ -2900,27 +2673,23 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2900 goto bad; 2673 goto bad;
2901 } 2674 }
2902 2675
2903 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2676 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2904 if (!cache->wq) { 2677 if (!cache->wq) {
2905 *error = "could not create workqueue for metadata object"; 2678 *error = "could not create workqueue for metadata object";
2906 goto bad; 2679 goto bad;
2907 } 2680 }
2908 INIT_WORK(&cache->worker, do_worker); 2681 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2682 INIT_WORK(&cache->deferred_writethrough_worker,
2683 process_deferred_writethrough_bios);
2684 INIT_WORK(&cache->migration_worker, check_migrations);
2909 INIT_DELAYED_WORK(&cache->waker, do_waker); 2685 INIT_DELAYED_WORK(&cache->waker, do_waker);
2910 cache->last_commit_jiffies = jiffies;
2911 2686
2912 cache->prison = dm_bio_prison_create(); 2687 cache->prison = dm_bio_prison_create_v2(cache->wq);
2913 if (!cache->prison) { 2688 if (!cache->prison) {
2914 *error = "could not create bio prison"; 2689 *error = "could not create bio prison";
2915 goto bad; 2690 goto bad;
2916 } 2691 }
2917 2692
2918 cache->all_io_ds = dm_deferred_set_create();
2919 if (!cache->all_io_ds) {
2920 *error = "could not create all_io deferred set";
2921 goto bad;
2922 }
2923
2924 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2693 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2925 migration_cache); 2694 migration_cache);
2926 if (!cache->migration_pool) { 2695 if (!cache->migration_pool) {
@@ -2947,11 +2716,15 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2947 spin_lock_init(&cache->invalidation_lock); 2716 spin_lock_init(&cache->invalidation_lock);
2948 INIT_LIST_HEAD(&cache->invalidation_requests); 2717 INIT_LIST_HEAD(&cache->invalidation_requests);
2949 2718
2719 batcher_init(&cache->committer, commit_op, cache,
2720 issue_op, cache, cache->wq);
2950 iot_init(&cache->origin_tracker); 2721 iot_init(&cache->origin_tracker);
2951 2722
2723 init_rwsem(&cache->background_work_lock);
2724 prevent_background_work(cache);
2725
2952 *result = cache; 2726 *result = cache;
2953 return 0; 2727 return 0;
2954
2955bad: 2728bad:
2956 destroy(cache); 2729 destroy(cache);
2957 return r; 2730 return r;
@@ -3009,7 +2782,6 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
3009 } 2782 }
3010 2783
3011 ti->private = cache; 2784 ti->private = cache;
3012
3013out: 2785out:
3014 destroy_cache_args(ca); 2786 destroy_cache_args(ca);
3015 return r; 2787 return r;
@@ -3022,17 +2794,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
3022 struct cache *cache = ti->private; 2794 struct cache *cache = ti->private;
3023 2795
3024 int r; 2796 int r;
3025 struct dm_bio_prison_cell *cell = NULL; 2797 bool commit_needed;
3026 dm_oblock_t block = get_bio_block(cache, bio); 2798 dm_oblock_t block = get_bio_block(cache, bio);
3027 size_t pb_data_size = get_per_bio_data_size(cache); 2799 size_t pb_data_size = get_per_bio_data_size(cache);
3028 bool can_migrate = false;
3029 bool fast_promotion;
3030 struct policy_result lookup_result;
3031 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
3032 struct old_oblock_lock ool;
3033
3034 ool.locker.fn = null_locker;
3035 2800
2801 init_per_bio_data(bio, pb_data_size);
3036 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2802 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
3037 /* 2803 /*
3038 * This can only occur if the io goes to a partial block at 2804 * This can only occur if the io goes to a partial block at
@@ -3049,101 +2815,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
3049 return DM_MAPIO_SUBMITTED; 2815 return DM_MAPIO_SUBMITTED;
3050 } 2816 }
3051 2817
3052 /* 2818 r = map_bio(cache, bio, block, &commit_needed);
3053 * Check to see if that block is currently migrating. 2819 if (commit_needed)
3054 */ 2820 schedule_commit(&cache->committer);
3055 cell = alloc_prison_cell(cache);
3056 if (!cell) {
3057 defer_bio(cache, bio);
3058 return DM_MAPIO_SUBMITTED;
3059 }
3060
3061 r = bio_detain(cache, block, bio, cell,
3062 (cell_free_fn) free_prison_cell,
3063 cache, &cell);
3064 if (r) {
3065 if (r < 0)
3066 defer_bio(cache, bio);
3067
3068 return DM_MAPIO_SUBMITTED;
3069 }
3070
3071 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
3072
3073 r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
3074 bio, &ool.locker, &lookup_result);
3075 if (r == -EWOULDBLOCK) {
3076 cell_defer(cache, cell, true);
3077 return DM_MAPIO_SUBMITTED;
3078
3079 } else if (r) {
3080 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
3081 cache_device_name(cache), r);
3082 cell_defer(cache, cell, false);
3083 bio_io_error(bio);
3084 return DM_MAPIO_SUBMITTED;
3085 }
3086
3087 r = DM_MAPIO_REMAPPED;
3088 switch (lookup_result.op) {
3089 case POLICY_HIT:
3090 if (passthrough_mode(&cache->features)) {
3091 if (bio_data_dir(bio) == WRITE) {
3092 /*
3093 * We need to invalidate this block, so
3094 * defer for the worker thread.
3095 */
3096 cell_defer(cache, cell, true);
3097 r = DM_MAPIO_SUBMITTED;
3098
3099 } else {
3100 inc_miss_counter(cache, bio);
3101 remap_to_origin_clear_discard(cache, bio, block);
3102 accounted_begin(cache, bio);
3103 inc_ds(cache, bio, cell);
3104 // FIXME: we want to remap hits or misses straight
3105 // away rather than passing over to the worker.
3106 cell_defer(cache, cell, false);
3107 }
3108
3109 } else {
3110 inc_hit_counter(cache, bio);
3111 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
3112 !is_dirty(cache, lookup_result.cblock)) {
3113 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
3114 accounted_begin(cache, bio);
3115 inc_ds(cache, bio, cell);
3116 cell_defer(cache, cell, false);
3117
3118 } else
3119 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
3120 }
3121 break;
3122
3123 case POLICY_MISS:
3124 inc_miss_counter(cache, bio);
3125 if (pb->req_nr != 0) {
3126 /*
3127 * This is a duplicate writethrough io that is no
3128 * longer needed because the block has been demoted.
3129 */
3130 bio_endio(bio);
3131 // FIXME: remap everything as a miss
3132 cell_defer(cache, cell, false);
3133 r = DM_MAPIO_SUBMITTED;
3134
3135 } else
3136 remap_cell_to_origin_clear_discard(cache, cell, block, false);
3137 break;
3138
3139 default:
3140 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
3141 cache_device_name(cache), __func__,
3142 (unsigned) lookup_result.op);
3143 cell_defer(cache, cell, false);
3144 bio_io_error(bio);
3145 r = DM_MAPIO_SUBMITTED;
3146 }
3147 2821
3148 return r; 2822 return r;
3149} 2823}
@@ -3163,7 +2837,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
3163 spin_unlock_irqrestore(&cache->lock, flags); 2837 spin_unlock_irqrestore(&cache->lock, flags);
3164 } 2838 }
3165 2839
3166 check_for_quiesced_migrations(cache, pb); 2840 bio_drop_shared_lock(cache, bio);
3167 accounted_complete(cache, bio); 2841 accounted_complete(cache, bio);
3168 2842
3169 return 0; 2843 return 0;
@@ -3263,12 +2937,18 @@ static void cache_postsuspend(struct dm_target *ti)
3263{ 2937{
3264 struct cache *cache = ti->private; 2938 struct cache *cache = ti->private;
3265 2939
3266 start_quiescing(cache); 2940 prevent_background_work(cache);
3267 wait_for_migrations(cache); 2941 BUG_ON(atomic_read(&cache->nr_io_migrations));
3268 stop_worker(cache); 2942
2943 cancel_delayed_work(&cache->waker);
2944 flush_workqueue(cache->wq);
2945 WARN_ON(cache->origin_tracker.in_flight);
2946
2947 /*
2948 * If it's a flush suspend there won't be any deferred bios, so this
2949 * call is harmless.
2950 */
3269 requeue_deferred_bios(cache); 2951 requeue_deferred_bios(cache);
3270 requeue_deferred_cells(cache);
3271 stop_quiescing(cache);
3272 2952
3273 if (get_cache_mode(cache) == CM_WRITE) 2953 if (get_cache_mode(cache) == CM_WRITE)
3274 (void) sync_metadata(cache); 2954 (void) sync_metadata(cache);
@@ -3280,15 +2960,10 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
3280 int r; 2960 int r;
3281 struct cache *cache = context; 2961 struct cache *cache = context;
3282 2962
3283 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 2963 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
3284 if (r) 2964 if (r)
3285 return r; 2965 return r;
3286 2966
3287 if (dirty)
3288 set_dirty(cache, oblock, cblock);
3289 else
3290 clear_dirty(cache, oblock, cblock);
3291
3292 return 0; 2967 return 0;
3293} 2968}
3294 2969
@@ -3487,6 +3162,7 @@ static void cache_resume(struct dm_target *ti)
3487 struct cache *cache = ti->private; 3162 struct cache *cache = ti->private;
3488 3163
3489 cache->need_tick_bio = true; 3164 cache->need_tick_bio = true;
3165 allow_background_work(cache);
3490 do_waker(&cache->waker.work); 3166 do_waker(&cache->waker.work);
3491} 3167}
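The prevent/allow pair used by cache_postsuspend() and cache_resume() sits on top of the background_work_lock rwsem initialised in cache_create() above. A minimal sketch, assuming background work takes the semaphore shared (the in-tree helpers may also suppress lockdep around the write side):

	static void prevent_background_work(struct cache *cache)
	{
		/* Exclude new background work until allow_background_work(). */
		down_write(&cache->background_work_lock);
	}

	static void allow_background_work(struct cache *cache)
	{
		up_write(&cache->background_work_lock);
	}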
3492 3168
@@ -3621,10 +3297,19 @@ err:
3621} 3297}
3622 3298
3623/* 3299/*
3300 * Defines a range of cblocks: begin to (end - 1) are in the range; end is
3301 * the one-past-the-end value.
3302 */
3303struct cblock_range {
3304 dm_cblock_t begin;
3305 dm_cblock_t end;
3306};
3307
3308/*
3624 * A cache block range can take two forms: 3309 * A cache block range can take two forms:
3625 * 3310 *
3626 * i) A single cblock, eg. '3456' 3311 * i) A single cblock, eg. '3456'
3627 * ii) A begin and end cblock with dots between, eg. 123-234 3312 * ii) A begin and end cblock with a dash between, eg. 123-234
3628 */ 3313 */
3629static int parse_cblock_range(struct cache *cache, const char *str, 3314static int parse_cblock_range(struct cache *cache, const char *str,
3630 struct cblock_range *result) 3315 struct cblock_range *result)
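The body of parse_cblock_range() isn't shown in this hunk. For illustration only, the two accepted forms can be distinguished with sscanf(); a standalone sketch (names and return conventions here are illustrative, not the in-tree code):

	#include <stdio.h>

	/* Illustrative parser: accept "3456" or "123-234" as a cblock range. */
	static int parse_range(const char *str, unsigned long long *begin,
			       unsigned long long *end)
	{
		char dummy;
		unsigned long long b, e;

		if (sscanf(str, "%llu-%llu%c", &b, &e, &dummy) == 2) {
			*begin = b;
			*end = e;		/* end is treated as one-past-the-end */
			return 0;
		}

		if (sscanf(str, "%llu%c", &b, &dummy) == 1) {
			*begin = b;
			*end = b + 1;		/* single block: half-open [b, b + 1) */
			return 0;
		}

		return -1;			/* neither form matched */
	}

	int main(void)
	{
		unsigned long long b, e;

		if (!parse_range("123-234", &b, &e))
			printf("begin=%llu end=%llu\n", b, e);
		return 0;
	}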
@@ -3690,23 +3375,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
3690 return 0; 3375 return 0;
3691} 3376}
3692 3377
3378static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3379{
3380 return to_cblock(from_cblock(b) + 1);
3381}
3382
3693static int request_invalidation(struct cache *cache, struct cblock_range *range) 3383static int request_invalidation(struct cache *cache, struct cblock_range *range)
3694{ 3384{
3695 struct invalidation_request req; 3385 int r = 0;
3696 3386
3697 INIT_LIST_HEAD(&req.list); 3387 /*
3698 req.cblocks = range; 3388 * We don't need to do any locking here because we know we're in
3699 atomic_set(&req.complete, 0); 3389 * passthrough mode. There is potential for a race between an
3700 req.err = 0; 3390 * invalidation triggered by an io and an invalidation message. This
3701 init_waitqueue_head(&req.result_wait); 3391 * is harmless; we needn't worry if the policy call fails.
3392 */
3393 while (range->begin != range->end) {
3394 r = invalidate_cblock(cache, range->begin);
3395 if (r)
3396 return r;
3702 3397
3703 spin_lock(&cache->invalidation_lock); 3398 range->begin = cblock_succ(range->begin);
3704 list_add(&req.list, &cache->invalidation_requests); 3399 }
3705 spin_unlock(&cache->invalidation_lock);
3706 wake_worker(cache);
3707 3400
3708 wait_event(req.result_wait, atomic_read(&req.complete)); 3401 cache->commit_requested = true;
3709 return req.err; 3402 return r;
3710} 3403}
3711 3404
3712static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3405static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
@@ -3816,7 +3509,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3816 3509
3817static struct target_type cache_target = { 3510static struct target_type cache_target = {
3818 .name = "cache", 3511 .name = "cache",
3819 .version = {1, 10, 0}, 3512 .version = {2, 0, 0},
3820 .module = THIS_MODULE, 3513 .module = THIS_MODULE,
3821 .ctr = cache_ctr, 3514 .ctr = cache_ctr,
3822 .dtr = cache_dtr, 3515 .dtr = cache_dtr,