Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/Kconfig                              |  22
-rw-r--r--  drivers/md/dm-cache-metadata.c                  | 104
-rw-r--r--  drivers/md/dm-cache-metadata.h                  |   5
-rw-r--r--  drivers/md/dm-cache-policy-internal.h           |   7
-rw-r--r--  drivers/md/dm-cache-policy-mq.c                 | 681
-rw-r--r--  drivers/md/dm-cache-policy.c                    |   4
-rw-r--r--  drivers/md/dm-cache-policy.h                    |  21
-rw-r--r--  drivers/md/dm-cache-target.c                    | 687
-rw-r--r--  drivers/md/dm-crypt.c                           | 214
-rw-r--r--  drivers/md/dm-ioctl.c                           |  36
-rw-r--r--  drivers/md/dm-mpath.c                           |  34
-rw-r--r--  drivers/md/dm-table.c                           |  23
-rw-r--r--  drivers/md/dm.c                                 |  47
-rw-r--r--  drivers/md/dm.h                                 |  13
-rw-r--r--  drivers/md/persistent-data/dm-array.c           |   5
-rw-r--r--  drivers/md/persistent-data/dm-space-map-disk.c  |  18
16 files changed, 1466 insertions(+), 455 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 30b426ed744b..f2ccbc3b9fe4 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -297,6 +297,17 @@ config DM_MIRROR
297 Allow volume managers to mirror logical volumes, also 297 Allow volume managers to mirror logical volumes, also
298 needed for live data migration tools such as 'pvmove'. 298 needed for live data migration tools such as 'pvmove'.
299 299
300config DM_LOG_USERSPACE
301 tristate "Mirror userspace logging"
302 depends on DM_MIRROR && NET
303 select CONNECTOR
304 ---help---
305 The userspace logging module provides a mechanism for
306 relaying the dm-dirty-log API to userspace. Log designs
307 which are more suited to userspace implementation (e.g.
308 shared storage logs) or experimental logs can be implemented
309 by leveraging this framework.
310
300config DM_RAID 311config DM_RAID
301 tristate "RAID 1/4/5/6/10 target" 312 tristate "RAID 1/4/5/6/10 target"
302 depends on BLK_DEV_DM 313 depends on BLK_DEV_DM
@@ -323,17 +334,6 @@ config DM_RAID
323 RAID-5, RAID-6 distributes the syndromes across the drives 334 RAID-5, RAID-6 distributes the syndromes across the drives
324 in one of the available parity distribution methods. 335 in one of the available parity distribution methods.
325 336
326config DM_LOG_USERSPACE
327 tristate "Mirror userspace logging"
328 depends on DM_MIRROR && NET
329 select CONNECTOR
330 ---help---
331 The userspace logging module provides a mechanism for
332 relaying the dm-dirty-log API to userspace. Log designs
333 which are more suited to userspace implementation (e.g.
334 shared storage logs) or experimental logs can be implemented
335 by leveraging this framework.
336
337config DM_ZERO 337config DM_ZERO
338 tristate "Zero target" 338 tristate "Zero target"
339 depends on BLK_DEV_DM 339 depends on BLK_DEV_DM
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 1af7255bbffb..9ef0752e8a08 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -20,7 +20,13 @@
20 20
21#define CACHE_SUPERBLOCK_MAGIC 06142003 21#define CACHE_SUPERBLOCK_MAGIC 06142003
22#define CACHE_SUPERBLOCK_LOCATION 0 22#define CACHE_SUPERBLOCK_LOCATION 0
23#define CACHE_VERSION 1 23
24/*
25 * defines a range of metadata versions that this module can handle.
26 */
27#define MIN_CACHE_VERSION 1
28#define MAX_CACHE_VERSION 1
29
24#define CACHE_METADATA_CACHE_SIZE 64 30#define CACHE_METADATA_CACHE_SIZE 64
25 31
26/* 32/*
@@ -134,6 +140,18 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
134 SUPERBLOCK_CSUM_XOR)); 140 SUPERBLOCK_CSUM_XOR));
135} 141}
136 142
143static int check_metadata_version(struct cache_disk_superblock *disk_super)
144{
145 uint32_t metadata_version = le32_to_cpu(disk_super->version);
146 if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
147 DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
148 metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
149 return -EINVAL;
150 }
151
152 return 0;
153}
154
137static int sb_check(struct dm_block_validator *v, 155static int sb_check(struct dm_block_validator *v,
138 struct dm_block *b, 156 struct dm_block *b,
139 size_t sb_block_size) 157 size_t sb_block_size)
@@ -164,7 +182,7 @@ static int sb_check(struct dm_block_validator *v,
164 return -EILSEQ; 182 return -EILSEQ;
165 } 183 }
166 184
167 return 0; 185 return check_metadata_version(disk_super);
168} 186}
169 187
170static struct dm_block_validator sb_validator = { 188static struct dm_block_validator sb_validator = {
@@ -198,7 +216,7 @@ static int superblock_lock(struct dm_cache_metadata *cmd,
198 216
199/*----------------------------------------------------------------*/ 217/*----------------------------------------------------------------*/
200 218
201static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) 219static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
202{ 220{
203 int r; 221 int r;
204 unsigned i; 222 unsigned i;
@@ -214,10 +232,10 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
214 return r; 232 return r;
215 233
216 data_le = dm_block_data(b); 234 data_le = dm_block_data(b);
217 *result = 1; 235 *result = true;
218 for (i = 0; i < sb_block_size; i++) { 236 for (i = 0; i < sb_block_size; i++) {
219 if (data_le[i] != zero) { 237 if (data_le[i] != zero) {
220 *result = 0; 238 *result = false;
221 break; 239 break;
222 } 240 }
223 } 241 }
@@ -270,7 +288,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
270 disk_super->flags = 0; 288 disk_super->flags = 0;
271 memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); 289 memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
272 disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); 290 disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
273 disk_super->version = cpu_to_le32(CACHE_VERSION); 291 disk_super->version = cpu_to_le32(MAX_CACHE_VERSION);
274 memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); 292 memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
275 memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); 293 memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
276 disk_super->policy_hint_size = 0; 294 disk_super->policy_hint_size = 0;
@@ -411,7 +429,8 @@ bad:
411static int __open_or_format_metadata(struct dm_cache_metadata *cmd, 429static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
412 bool format_device) 430 bool format_device)
413{ 431{
414 int r, unformatted; 432 int r;
433 bool unformatted = false;
415 434
416 r = __superblock_all_zeroes(cmd->bm, &unformatted); 435 r = __superblock_all_zeroes(cmd->bm, &unformatted);
417 if (r) 436 if (r)
@@ -666,19 +685,85 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
666 kfree(cmd); 685 kfree(cmd);
667} 686}
668 687
688/*
689 * Checks that the given cache block is either unmapped or clean.
690 */
691static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
692 bool *result)
693{
694 int r;
695 __le64 value;
696 dm_oblock_t ob;
697 unsigned flags;
698
699 r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
700 if (r) {
701 DMERR("block_unmapped_or_clean failed");
702 return r;
703 }
704
705 unpack_value(value, &ob, &flags);
706 *result = !((flags & M_VALID) && (flags & M_DIRTY));
707
708 return 0;
709}
710
711static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
712 dm_cblock_t begin, dm_cblock_t end,
713 bool *result)
714{
715 int r;
716 *result = true;
717
718 while (begin != end) {
719 r = block_unmapped_or_clean(cmd, begin, result);
720 if (r)
721 return r;
722
723 if (!*result) {
724 DMERR("cache block %llu is dirty",
725 (unsigned long long) from_cblock(begin));
726 return 0;
727 }
728
729 begin = to_cblock(from_cblock(begin) + 1);
730 }
731
732 return 0;
733}
734
669int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) 735int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
670{ 736{
671 int r; 737 int r;
738 bool clean;
672 __le64 null_mapping = pack_value(0, 0); 739 __le64 null_mapping = pack_value(0, 0);
673 740
674 down_write(&cmd->root_lock); 741 down_write(&cmd->root_lock);
675 __dm_bless_for_disk(&null_mapping); 742 __dm_bless_for_disk(&null_mapping);
743
744 if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
745 r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean);
746 if (r) {
747 __dm_unbless_for_disk(&null_mapping);
748 goto out;
749 }
750
751 if (!clean) {
752 DMERR("unable to shrink cache due to dirty blocks");
753 r = -EINVAL;
754 __dm_unbless_for_disk(&null_mapping);
755 goto out;
756 }
757 }
758
676 r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), 759 r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
677 from_cblock(new_cache_size), 760 from_cblock(new_cache_size),
678 &null_mapping, &cmd->root); 761 &null_mapping, &cmd->root);
679 if (!r) 762 if (!r)
680 cmd->cache_blocks = new_cache_size; 763 cmd->cache_blocks = new_cache_size;
681 cmd->changed = true; 764 cmd->changed = true;
765
766out:
682 up_write(&cmd->root_lock); 767 up_write(&cmd->root_lock);
683 768
684 return r; 769 return r;
@@ -1182,3 +1267,8 @@ int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
1182 1267
1183 return r; 1268 return r;
1184} 1269}
1270
1271int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
1272{
1273 return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
1274}
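With this change dm_cache_resize() refuses to shrink the mapping array while any cache block in the tail being dropped is still dirty, failing with -EINVAL instead of discarding dirty data. A minimal caller-side sketch of the resulting contract, illustrative only and not part of this patch (shrink_cache() and writeback_all_dirty() are hypothetical helpers; assumes dm-cache-metadata.h):

static int shrink_cache(struct dm_cache_metadata *cmd, dm_cblock_t new_size)
{
	int r = dm_cache_resize(cmd, new_size);

	if (r == -EINVAL) {
		/* The tail still holds dirty blocks: write them back, then retry. */
		r = writeback_all_dirty(cmd);		/* hypothetical flush step */
		if (!r)
			r = dm_cache_resize(cmd, new_size);
	}

	return r;
}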
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index f45cef21f3d0..cd906f14f98d 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -137,6 +137,11 @@ int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
137int dm_cache_save_hint(struct dm_cache_metadata *cmd, 137int dm_cache_save_hint(struct dm_cache_metadata *cmd,
138 dm_cblock_t cblock, uint32_t hint); 138 dm_cblock_t cblock, uint32_t hint);
139 139
140/*
141 * Query method. Are all the blocks in the cache clean?
142 */
143int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
144
140/*----------------------------------------------------------------*/ 145/*----------------------------------------------------------------*/
141 146
142#endif /* DM_CACHE_METADATA_H */ 147#endif /* DM_CACHE_METADATA_H */
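The new query method reports, via *result, whether every cache block is unmapped or clean. A short, hypothetical usage sketch (a caller might consult it, for example, before a mode change that requires a clean cache; treating a metadata error as "not clean" is this sketch's choice, not something the patch mandates):

static bool cache_is_clean(struct dm_cache_metadata *cmd)
{
	bool clean = false;

	if (dm_cache_metadata_all_clean(cmd, &clean))
		return false;	/* metadata error: assume not clean */

	return clean;
}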
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 0928abdc49f0..2256a1f24f73 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -61,7 +61,12 @@ static inline int policy_writeback_work(struct dm_cache_policy *p,
61 61
62static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 62static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
63{ 63{
64 return p->remove_mapping(p, oblock); 64 p->remove_mapping(p, oblock);
65}
66
67static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
68{
69 return p->remove_cblock(p, cblock);
65} 70}
66 71
67static inline void policy_force_mapping(struct dm_cache_policy *p, 72static inline void policy_force_mapping(struct dm_cache_policy *p,
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 4296155090b2..416b7b752a6e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -26,19 +26,6 @@ static unsigned next_power(unsigned n, unsigned min)
26 26
27/*----------------------------------------------------------------*/ 27/*----------------------------------------------------------------*/
28 28
29static unsigned long *alloc_bitset(unsigned nr_entries)
30{
31 size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
32 return vzalloc(s);
33}
34
35static void free_bitset(unsigned long *bits)
36{
37 vfree(bits);
38}
39
40/*----------------------------------------------------------------*/
41
42/* 29/*
43 * Large, sequential ios are probably better left on the origin device since 30 * Large, sequential ios are probably better left on the origin device since
44 * spindles tend to have good bandwidth. 31 * spindles tend to have good bandwidth.
@@ -151,6 +138,21 @@ static void queue_init(struct queue *q)
151} 138}
152 139
153/* 140/*
141 * Checks to see if the queue is empty.
142 * FIXME: reduce cpu usage.
143 */
144static bool queue_empty(struct queue *q)
145{
146 unsigned i;
147
148 for (i = 0; i < NR_QUEUE_LEVELS; i++)
149 if (!list_empty(q->qs + i))
150 return false;
151
152 return true;
153}
154
155/*
154 * Insert an entry to the back of the given level. 156 * Insert an entry to the back of the given level.
155 */ 157 */
156static void queue_push(struct queue *q, unsigned level, struct list_head *elt) 158static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
@@ -218,17 +220,116 @@ struct entry {
218 struct hlist_node hlist; 220 struct hlist_node hlist;
219 struct list_head list; 221 struct list_head list;
220 dm_oblock_t oblock; 222 dm_oblock_t oblock;
221 dm_cblock_t cblock; /* valid iff in_cache */
222 223
223 /* 224 /*
224 * FIXME: pack these better 225 * FIXME: pack these better
225 */ 226 */
226 bool in_cache:1; 227 bool dirty:1;
227 unsigned hit_count; 228 unsigned hit_count;
228 unsigned generation; 229 unsigned generation;
229 unsigned tick; 230 unsigned tick;
230}; 231};
231 232
233/*
234 * Rather than storing the cblock in an entry, we allocate all entries in
235 * an array, and infer the cblock from the entry position.
236 *
237 * Free entries are linked together into a list.
238 */
239struct entry_pool {
240 struct entry *entries, *entries_end;
241 struct list_head free;
242 unsigned nr_allocated;
243};
244
245static int epool_init(struct entry_pool *ep, unsigned nr_entries)
246{
247 unsigned i;
248
249 ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
250 if (!ep->entries)
251 return -ENOMEM;
252
253 ep->entries_end = ep->entries + nr_entries;
254
255 INIT_LIST_HEAD(&ep->free);
256 for (i = 0; i < nr_entries; i++)
257 list_add(&ep->entries[i].list, &ep->free);
258
259 ep->nr_allocated = 0;
260
261 return 0;
262}
263
264static void epool_exit(struct entry_pool *ep)
265{
266 vfree(ep->entries);
267}
268
269static struct entry *alloc_entry(struct entry_pool *ep)
270{
271 struct entry *e;
272
273 if (list_empty(&ep->free))
274 return NULL;
275
276 e = list_entry(list_pop(&ep->free), struct entry, list);
277 INIT_LIST_HEAD(&e->list);
278 INIT_HLIST_NODE(&e->hlist);
279 ep->nr_allocated++;
280
281 return e;
282}
283
284/*
285 * This assumes the cblock hasn't already been allocated.
286 */
287static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
288{
289 struct entry *e = ep->entries + from_cblock(cblock);
290 list_del(&e->list);
291
292 INIT_LIST_HEAD(&e->list);
293 INIT_HLIST_NODE(&e->hlist);
294 ep->nr_allocated++;
295
296 return e;
297}
298
299static void free_entry(struct entry_pool *ep, struct entry *e)
300{
301 BUG_ON(!ep->nr_allocated);
302 ep->nr_allocated--;
303 INIT_HLIST_NODE(&e->hlist);
304 list_add(&e->list, &ep->free);
305}
306
307/*
308 * Returns NULL if the entry is free.
309 */
310static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
311{
312 struct entry *e = ep->entries + from_cblock(cblock);
313 return !hlist_unhashed(&e->hlist) ? e : NULL;
314}
315
316static bool epool_empty(struct entry_pool *ep)
317{
318 return list_empty(&ep->free);
319}
320
321static bool in_pool(struct entry_pool *ep, struct entry *e)
322{
323 return e >= ep->entries && e < ep->entries_end;
324}
325
326static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
327{
328 return to_cblock(e - ep->entries);
329}
330
331/*----------------------------------------------------------------*/
332
232struct mq_policy { 333struct mq_policy {
233 struct dm_cache_policy policy; 334 struct dm_cache_policy policy;
234 335
@@ -238,13 +339,22 @@ struct mq_policy {
238 struct io_tracker tracker; 339 struct io_tracker tracker;
239 340
240 /* 341 /*
241 * We maintain two queues of entries. The cache proper contains 342 * Entries come from two pools, one of pre-cache entries, and one
242 * the currently active mappings. Whereas the pre_cache tracks 343 * for the cache proper.
243 * blocks that are being hit frequently and potential candidates 344 */
244 * for promotion to the cache. 345 struct entry_pool pre_cache_pool;
346 struct entry_pool cache_pool;
347
348 /*
349 * We maintain three queues of entries. The cache proper,
350 * consisting of a clean and dirty queue, contains the currently
351 * active mappings. Whereas the pre_cache tracks blocks that
352 * are being hit frequently and potential candidates for promotion
353 * to the cache.
245 */ 354 */
246 struct queue pre_cache; 355 struct queue pre_cache;
247 struct queue cache; 356 struct queue cache_clean;
357 struct queue cache_dirty;
248 358
249 /* 359 /*
250 * Keeps track of time, incremented by the core. We use this to 360 * Keeps track of time, incremented by the core. We use this to
@@ -282,25 +392,6 @@ struct mq_policy {
282 unsigned promote_threshold; 392 unsigned promote_threshold;
283 393
284 /* 394 /*
285 * We need cache_size entries for the cache, and choose to have
286 * cache_size entries for the pre_cache too. One motivation for
287 * using the same size is to make the hit counts directly
288 * comparable between pre_cache and cache.
289 */
290 unsigned nr_entries;
291 unsigned nr_entries_allocated;
292 struct list_head free;
293
294 /*
295 * Cache blocks may be unallocated. We store this info in a
296 * bitset.
297 */
298 unsigned long *allocation_bitset;
299 unsigned nr_cblocks_allocated;
300 unsigned find_free_nr_words;
301 unsigned find_free_last_word;
302
303 /*
304 * The hash table allows us to quickly find an entry by origin 395 * The hash table allows us to quickly find an entry by origin
305 * block. Both pre_cache and cache entries are in here. 396 * block. Both pre_cache and cache entries are in here.
306 */ 397 */
@@ -310,49 +401,6 @@ struct mq_policy {
310}; 401};
311 402
312/*----------------------------------------------------------------*/ 403/*----------------------------------------------------------------*/
313/* Free/alloc mq cache entry structures. */
314static void takeout_queue(struct list_head *lh, struct queue *q)
315{
316 unsigned level;
317
318 for (level = 0; level < NR_QUEUE_LEVELS; level++)
319 list_splice(q->qs + level, lh);
320}
321
322static void free_entries(struct mq_policy *mq)
323{
324 struct entry *e, *tmp;
325
326 takeout_queue(&mq->free, &mq->pre_cache);
327 takeout_queue(&mq->free, &mq->cache);
328
329 list_for_each_entry_safe(e, tmp, &mq->free, list)
330 kmem_cache_free(mq_entry_cache, e);
331}
332
333static int alloc_entries(struct mq_policy *mq, unsigned elts)
334{
335 unsigned u = mq->nr_entries;
336
337 INIT_LIST_HEAD(&mq->free);
338 mq->nr_entries_allocated = 0;
339
340 while (u--) {
341 struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);
342
343 if (!e) {
344 free_entries(mq);
345 return -ENOMEM;
346 }
347
348
349 list_add(&e->list, &mq->free);
350 }
351
352 return 0;
353}
354
355/*----------------------------------------------------------------*/
356 404
357/* 405/*
358 * Simple hash table implementation. Should replace with the standard hash 406 * Simple hash table implementation. Should replace with the standard hash
@@ -388,96 +436,14 @@ static void hash_remove(struct entry *e)
388 436
389/*----------------------------------------------------------------*/ 437/*----------------------------------------------------------------*/
390 438
391/*
392 * Allocates a new entry structure. The memory is allocated in one lump,
393 * so we just handing it out here. Returns NULL if all entries have
394 * already been allocated. Cannot fail otherwise.
395 */
396static struct entry *alloc_entry(struct mq_policy *mq)
397{
398 struct entry *e;
399
400 if (mq->nr_entries_allocated >= mq->nr_entries) {
401 BUG_ON(!list_empty(&mq->free));
402 return NULL;
403 }
404
405 e = list_entry(list_pop(&mq->free), struct entry, list);
406 INIT_LIST_HEAD(&e->list);
407 INIT_HLIST_NODE(&e->hlist);
408
409 mq->nr_entries_allocated++;
410 return e;
411}
412
413/*----------------------------------------------------------------*/
414
415/*
416 * Mark cache blocks allocated or not in the bitset.
417 */
418static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
419{
420 BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
421 BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));
422
423 set_bit(from_cblock(cblock), mq->allocation_bitset);
424 mq->nr_cblocks_allocated++;
425}
426
427static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
428{
429 BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
430 BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));
431
432 clear_bit(from_cblock(cblock), mq->allocation_bitset);
433 mq->nr_cblocks_allocated--;
434}
435
436static bool any_free_cblocks(struct mq_policy *mq) 439static bool any_free_cblocks(struct mq_policy *mq)
437{ 440{
438 return mq->nr_cblocks_allocated < from_cblock(mq->cache_size); 441 return !epool_empty(&mq->cache_pool);
439} 442}
440 443
441/* 444static bool any_clean_cblocks(struct mq_policy *mq)
442 * Fills result out with a cache block that isn't in use, or return
443 * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is
444 * reponsible for that.
445 */
446static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
447 dm_cblock_t *result, unsigned *last_word)
448{ 445{
449 int r = -ENOSPC; 446 return !queue_empty(&mq->cache_clean);
450 unsigned w;
451
452 for (w = begin; w < end; w++) {
453 /*
454 * ffz is undefined if no zero exists
455 */
456 if (mq->allocation_bitset[w] != ~0UL) {
457 *last_word = w;
458 *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w]));
459 if (from_cblock(*result) < from_cblock(mq->cache_size))
460 r = 0;
461
462 break;
463 }
464 }
465
466 return r;
467}
468
469static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
470{
471 int r;
472
473 if (!any_free_cblocks(mq))
474 return -ENOSPC;
475
476 r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word);
477 if (r == -ENOSPC && mq->find_free_last_word)
478 r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word);
479
480 return r;
481} 447}
482 448
483/*----------------------------------------------------------------*/ 449/*----------------------------------------------------------------*/
@@ -496,33 +462,35 @@ static unsigned queue_level(struct entry *e)
496 return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); 462 return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
497} 463}
498 464
465static bool in_cache(struct mq_policy *mq, struct entry *e)
466{
467 return in_pool(&mq->cache_pool, e);
468}
469
499/* 470/*
500 * Inserts the entry into the pre_cache or the cache. Ensures the cache 471 * Inserts the entry into the pre_cache or the cache. Ensures the cache
501 * block is marked as allocated if necc. Inserts into the hash table. Sets the 472 * block is marked as allocated if necc. Inserts into the hash table.
502 * tick which records when the entry was last moved about. 473 * Sets the tick which records when the entry was last moved about.
503 */ 474 */
504static void push(struct mq_policy *mq, struct entry *e) 475static void push(struct mq_policy *mq, struct entry *e)
505{ 476{
506 e->tick = mq->tick; 477 e->tick = mq->tick;
507 hash_insert(mq, e); 478 hash_insert(mq, e);
508 479
509 if (e->in_cache) { 480 if (in_cache(mq, e))
510 alloc_cblock(mq, e->cblock); 481 queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
511 queue_push(&mq->cache, queue_level(e), &e->list); 482 queue_level(e), &e->list);
512 } else 483 else
513 queue_push(&mq->pre_cache, queue_level(e), &e->list); 484 queue_push(&mq->pre_cache, queue_level(e), &e->list);
514} 485}
515 486
516/* 487/*
517 * Removes an entry from pre_cache or cache. Removes from the hash table. 488 * Removes an entry from pre_cache or cache. Removes from the hash table.
518 * Frees off the cache block if necc.
519 */ 489 */
520static void del(struct mq_policy *mq, struct entry *e) 490static void del(struct mq_policy *mq, struct entry *e)
521{ 491{
522 queue_remove(&e->list); 492 queue_remove(&e->list);
523 hash_remove(e); 493 hash_remove(e);
524 if (e->in_cache)
525 free_cblock(mq, e->cblock);
526} 494}
527 495
528/* 496/*
@@ -531,14 +499,14 @@ static void del(struct mq_policy *mq, struct entry *e)
531 */ 499 */
532static struct entry *pop(struct mq_policy *mq, struct queue *q) 500static struct entry *pop(struct mq_policy *mq, struct queue *q)
533{ 501{
534 struct entry *e = container_of(queue_pop(q), struct entry, list); 502 struct entry *e;
503 struct list_head *h = queue_pop(q);
535 504
536 if (e) { 505 if (!h)
537 hash_remove(e); 506 return NULL;
538 507
539 if (e->in_cache) 508 e = container_of(h, struct entry, list);
540 free_cblock(mq, e->cblock); 509 hash_remove(e);
541 }
542 510
543 return e; 511 return e;
544} 512}
@@ -556,7 +524,8 @@ static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
556 * of the entries. 524 * of the entries.
557 * 525 *
558 * At the moment the threshold is taken by averaging the hit counts of some 526 * At the moment the threshold is taken by averaging the hit counts of some
559 * of the entries in the cache (the first 20 entries of the first level). 527 * of the entries in the cache (the first 20 entries across all levels in
528 * ascending order, giving preference to the clean entries at each level).
560 * 529 *
561 * We can be much cleverer than this though. For example, each promotion 530 * We can be much cleverer than this though. For example, each promotion
562 * could bump up the threshold helping to prevent churn. Much more to do 531 * could bump up the threshold helping to prevent churn. Much more to do
@@ -571,14 +540,21 @@ static void check_generation(struct mq_policy *mq)
571 struct list_head *head; 540 struct list_head *head;
572 struct entry *e; 541 struct entry *e;
573 542
574 if ((mq->hit_count >= mq->generation_period) && 543 if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
575 (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) {
576
577 mq->hit_count = 0; 544 mq->hit_count = 0;
578 mq->generation++; 545 mq->generation++;
579 546
580 for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { 547 for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
581 head = mq->cache.qs + level; 548 head = mq->cache_clean.qs + level;
549 list_for_each_entry(e, head, list) {
550 nr++;
551 total += e->hit_count;
552
553 if (++count >= MAX_TO_AVERAGE)
554 break;
555 }
556
557 head = mq->cache_dirty.qs + level;
582 list_for_each_entry(e, head, list) { 558 list_for_each_entry(e, head, list) {
583 nr++; 559 nr++;
584 total += e->hit_count; 560 total += e->hit_count;
@@ -631,19 +607,30 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
631 * - set the hit count to a hard coded value other than 1, eg, is it better 607 * - set the hit count to a hard coded value other than 1, eg, is it better
632 * if it goes in at level 2? 608 * if it goes in at level 2?
633 */ 609 */
634static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) 610static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
635{ 611{
636 dm_cblock_t result; 612 struct entry *demoted = pop(mq, &mq->cache_clean);
637 struct entry *demoted = pop(mq, &mq->cache); 613
614 if (!demoted)
615 /*
616 * We could get a block from mq->cache_dirty, but that
617 * would add extra latency to the triggering bio as it
618 * waits for the writeback. Better to not promote this
619 * time and hope there's a clean block next time this block
620 * is hit.
621 */
622 return -ENOSPC;
638 623
639 BUG_ON(!demoted);
640 result = demoted->cblock;
641 *oblock = demoted->oblock; 624 *oblock = demoted->oblock;
642 demoted->in_cache = false; 625 free_entry(&mq->cache_pool, demoted);
643 demoted->hit_count = 1; 626
644 push(mq, demoted); 627 /*
628 * We used to put the demoted block into the pre-cache, but I think
629 * it's simpler to just let it work it's way up from zero again.
630 * Stops blocks flickering in and out of the cache.
631 */
645 632
646 return result; 633 return 0;
647} 634}
648 635
649/* 636/*
@@ -662,17 +649,18 @@ static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
662static unsigned adjusted_promote_threshold(struct mq_policy *mq, 649static unsigned adjusted_promote_threshold(struct mq_policy *mq,
663 bool discarded_oblock, int data_dir) 650 bool discarded_oblock, int data_dir)
664{ 651{
665 if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE) 652 if (data_dir == READ)
653 return mq->promote_threshold + READ_PROMOTE_THRESHOLD;
654
655 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
666 /* 656 /*
667 * We don't need to do any copying at all, so give this a 657 * We don't need to do any copying at all, so give this a
668 * very low threshold. In practice this only triggers 658 * very low threshold.
669 * during initial population after a format.
670 */ 659 */
671 return DISCARDED_PROMOTE_THRESHOLD; 660 return DISCARDED_PROMOTE_THRESHOLD;
661 }
672 662
673 return data_dir == READ ? 663 return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD;
674 (mq->promote_threshold + READ_PROMOTE_THRESHOLD) :
675 (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD);
676} 664}
677 665
678static bool should_promote(struct mq_policy *mq, struct entry *e, 666static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -688,34 +676,49 @@ static int cache_entry_found(struct mq_policy *mq,
688{ 676{
689 requeue_and_update_tick(mq, e); 677 requeue_and_update_tick(mq, e);
690 678
691 if (e->in_cache) { 679 if (in_cache(mq, e)) {
692 result->op = POLICY_HIT; 680 result->op = POLICY_HIT;
693 result->cblock = e->cblock; 681 result->cblock = infer_cblock(&mq->cache_pool, e);
694 } 682 }
695 683
696 return 0; 684 return 0;
697} 685}
698 686
699/* 687/*
700 * Moves and entry from the pre_cache to the cache. The main work is 688 * Moves an entry from the pre_cache to the cache. The main work is
701 * finding which cache block to use. 689 * finding which cache block to use.
702 */ 690 */
703static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, 691static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
704 struct policy_result *result) 692 struct policy_result *result)
705{ 693{
706 dm_cblock_t cblock; 694 int r;
695 struct entry *new_e;
707 696
708 if (find_free_cblock(mq, &cblock) == -ENOSPC) { 697 /* Ensure there's a free cblock in the cache */
698 if (epool_empty(&mq->cache_pool)) {
709 result->op = POLICY_REPLACE; 699 result->op = POLICY_REPLACE;
710 cblock = demote_cblock(mq, &result->old_oblock); 700 r = demote_cblock(mq, &result->old_oblock);
701 if (r) {
702 result->op = POLICY_MISS;
703 return 0;
704 }
711 } else 705 } else
712 result->op = POLICY_NEW; 706 result->op = POLICY_NEW;
713 707
714 result->cblock = e->cblock = cblock; 708 new_e = alloc_entry(&mq->cache_pool);
709 BUG_ON(!new_e);
710
711 new_e->oblock = e->oblock;
712 new_e->dirty = false;
713 new_e->hit_count = e->hit_count;
714 new_e->generation = e->generation;
715 new_e->tick = e->tick;
715 716
716 del(mq, e); 717 del(mq, e);
717 e->in_cache = true; 718 free_entry(&mq->pre_cache_pool, e);
718 push(mq, e); 719 push(mq, new_e);
720
721 result->cblock = infer_cblock(&mq->cache_pool, new_e);
719 722
720 return 0; 723 return 0;
721} 724}
@@ -743,7 +746,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
743static void insert_in_pre_cache(struct mq_policy *mq, 746static void insert_in_pre_cache(struct mq_policy *mq,
744 dm_oblock_t oblock) 747 dm_oblock_t oblock)
745{ 748{
746 struct entry *e = alloc_entry(mq); 749 struct entry *e = alloc_entry(&mq->pre_cache_pool);
747 750
748 if (!e) 751 if (!e)
749 /* 752 /*
@@ -757,7 +760,7 @@ static void insert_in_pre_cache(struct mq_policy *mq,
757 return; 760 return;
758 } 761 }
759 762
760 e->in_cache = false; 763 e->dirty = false;
761 e->oblock = oblock; 764 e->oblock = oblock;
762 e->hit_count = 1; 765 e->hit_count = 1;
763 e->generation = mq->generation; 766 e->generation = mq->generation;
@@ -767,30 +770,36 @@ static void insert_in_pre_cache(struct mq_policy *mq,
767static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, 770static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
768 struct policy_result *result) 771 struct policy_result *result)
769{ 772{
773 int r;
770 struct entry *e; 774 struct entry *e;
771 dm_cblock_t cblock;
772 775
773 if (find_free_cblock(mq, &cblock) == -ENOSPC) { 776 if (epool_empty(&mq->cache_pool)) {
774 result->op = POLICY_MISS; 777 result->op = POLICY_REPLACE;
775 insert_in_pre_cache(mq, oblock); 778 r = demote_cblock(mq, &result->old_oblock);
776 return; 779 if (unlikely(r)) {
777 } 780 result->op = POLICY_MISS;
781 insert_in_pre_cache(mq, oblock);
782 return;
783 }
778 784
779 e = alloc_entry(mq); 785 /*
780 if (unlikely(!e)) { 786 * This will always succeed, since we've just demoted.
781 result->op = POLICY_MISS; 787 */
782 return; 788 e = alloc_entry(&mq->cache_pool);
789 BUG_ON(!e);
790
791 } else {
792 e = alloc_entry(&mq->cache_pool);
793 result->op = POLICY_NEW;
783 } 794 }
784 795
785 e->oblock = oblock; 796 e->oblock = oblock;
786 e->cblock = cblock; 797 e->dirty = false;
787 e->in_cache = true;
788 e->hit_count = 1; 798 e->hit_count = 1;
789 e->generation = mq->generation; 799 e->generation = mq->generation;
790 push(mq, e); 800 push(mq, e);
791 801
792 result->op = POLICY_NEW; 802 result->cblock = infer_cblock(&mq->cache_pool, e);
793 result->cblock = e->cblock;
794} 803}
795 804
796static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, 805static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
@@ -821,13 +830,16 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
821 int r = 0; 830 int r = 0;
822 struct entry *e = hash_lookup(mq, oblock); 831 struct entry *e = hash_lookup(mq, oblock);
823 832
824 if (e && e->in_cache) 833 if (e && in_cache(mq, e))
825 r = cache_entry_found(mq, e, result); 834 r = cache_entry_found(mq, e, result);
835
826 else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) 836 else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
827 result->op = POLICY_MISS; 837 result->op = POLICY_MISS;
838
828 else if (e) 839 else if (e)
829 r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, 840 r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
830 data_dir, result); 841 data_dir, result);
842
831 else 843 else
832 r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, 844 r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
833 data_dir, result); 845 data_dir, result);
@@ -854,9 +866,9 @@ static void mq_destroy(struct dm_cache_policy *p)
854{ 866{
855 struct mq_policy *mq = to_mq_policy(p); 867 struct mq_policy *mq = to_mq_policy(p);
856 868
857 free_bitset(mq->allocation_bitset);
858 kfree(mq->table); 869 kfree(mq->table);
859 free_entries(mq); 870 epool_exit(&mq->cache_pool);
871 epool_exit(&mq->pre_cache_pool);
860 kfree(mq); 872 kfree(mq);
861} 873}
862 874
@@ -904,8 +916,8 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t
904 return -EWOULDBLOCK; 916 return -EWOULDBLOCK;
905 917
906 e = hash_lookup(mq, oblock); 918 e = hash_lookup(mq, oblock);
907 if (e && e->in_cache) { 919 if (e && in_cache(mq, e)) {
908 *cblock = e->cblock; 920 *cblock = infer_cblock(&mq->cache_pool, e);
909 r = 0; 921 r = 0;
910 } else 922 } else
911 r = -ENOENT; 923 r = -ENOENT;
@@ -915,6 +927,36 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t
915 return r; 927 return r;
916} 928}
917 929
930static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
931{
932 struct entry *e;
933
934 e = hash_lookup(mq, oblock);
935 BUG_ON(!e || !in_cache(mq, e));
936
937 del(mq, e);
938 e->dirty = set;
939 push(mq, e);
940}
941
942static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
943{
944 struct mq_policy *mq = to_mq_policy(p);
945
946 mutex_lock(&mq->lock);
947 __mq_set_clear_dirty(mq, oblock, true);
948 mutex_unlock(&mq->lock);
949}
950
951static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
952{
953 struct mq_policy *mq = to_mq_policy(p);
954
955 mutex_lock(&mq->lock);
956 __mq_set_clear_dirty(mq, oblock, false);
957 mutex_unlock(&mq->lock);
958}
959
918static int mq_load_mapping(struct dm_cache_policy *p, 960static int mq_load_mapping(struct dm_cache_policy *p,
919 dm_oblock_t oblock, dm_cblock_t cblock, 961 dm_oblock_t oblock, dm_cblock_t cblock,
920 uint32_t hint, bool hint_valid) 962 uint32_t hint, bool hint_valid)
@@ -922,13 +964,9 @@ static int mq_load_mapping(struct dm_cache_policy *p,
922 struct mq_policy *mq = to_mq_policy(p); 964 struct mq_policy *mq = to_mq_policy(p);
923 struct entry *e; 965 struct entry *e;
924 966
925 e = alloc_entry(mq); 967 e = alloc_particular_entry(&mq->cache_pool, cblock);
926 if (!e)
927 return -ENOMEM;
928
929 e->cblock = cblock;
930 e->oblock = oblock; 968 e->oblock = oblock;
931 e->in_cache = true; 969 e->dirty = false; /* this gets corrected in a minute */
932 e->hit_count = hint_valid ? hint : 1; 970 e->hit_count = hint_valid ? hint : 1;
933 e->generation = mq->generation; 971 e->generation = mq->generation;
934 push(mq, e); 972 push(mq, e);
@@ -936,57 +974,126 @@ static int mq_load_mapping(struct dm_cache_policy *p,
936 return 0; 974 return 0;
937} 975}
938 976
977static int mq_save_hints(struct mq_policy *mq, struct queue *q,
978 policy_walk_fn fn, void *context)
979{
980 int r;
981 unsigned level;
982 struct entry *e;
983
984 for (level = 0; level < NR_QUEUE_LEVELS; level++)
985 list_for_each_entry(e, q->qs + level, list) {
986 r = fn(context, infer_cblock(&mq->cache_pool, e),
987 e->oblock, e->hit_count);
988 if (r)
989 return r;
990 }
991
992 return 0;
993}
994
939static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, 995static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
940 void *context) 996 void *context)
941{ 997{
942 struct mq_policy *mq = to_mq_policy(p); 998 struct mq_policy *mq = to_mq_policy(p);
943 int r = 0; 999 int r = 0;
944 struct entry *e;
945 unsigned level;
946 1000
947 mutex_lock(&mq->lock); 1001 mutex_lock(&mq->lock);
948 1002
949 for (level = 0; level < NR_QUEUE_LEVELS; level++) 1003 r = mq_save_hints(mq, &mq->cache_clean, fn, context);
950 list_for_each_entry(e, &mq->cache.qs[level], list) { 1004 if (!r)
951 r = fn(context, e->cblock, e->oblock, e->hit_count); 1005 r = mq_save_hints(mq, &mq->cache_dirty, fn, context);
952 if (r)
953 goto out;
954 }
955 1006
956out:
957 mutex_unlock(&mq->lock); 1007 mutex_unlock(&mq->lock);
958 1008
959 return r; 1009 return r;
960} 1010}
961 1011
1012static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
1013{
1014 struct entry *e;
1015
1016 e = hash_lookup(mq, oblock);
1017 BUG_ON(!e || !in_cache(mq, e));
1018
1019 del(mq, e);
1020 free_entry(&mq->cache_pool, e);
1021}
1022
962static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) 1023static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
963{ 1024{
964 struct mq_policy *mq = to_mq_policy(p); 1025 struct mq_policy *mq = to_mq_policy(p);
965 struct entry *e;
966 1026
967 mutex_lock(&mq->lock); 1027 mutex_lock(&mq->lock);
1028 __remove_mapping(mq, oblock);
1029 mutex_unlock(&mq->lock);
1030}
968 1031
969 e = hash_lookup(mq, oblock); 1032static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
1033{
1034 struct entry *e = epool_find(&mq->cache_pool, cblock);
970 1035
971 BUG_ON(!e || !e->in_cache); 1036 if (!e)
1037 return -ENODATA;
972 1038
973 del(mq, e); 1039 del(mq, e);
974 e->in_cache = false; 1040 free_entry(&mq->cache_pool, e);
975 push(mq, e);
976 1041
1042 return 0;
1043}
1044
1045static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
1046{
1047 int r;
1048 struct mq_policy *mq = to_mq_policy(p);
1049
1050 mutex_lock(&mq->lock);
1051 r = __remove_cblock(mq, cblock);
977 mutex_unlock(&mq->lock); 1052 mutex_unlock(&mq->lock);
1053
1054 return r;
978} 1055}
979 1056
980static void force_mapping(struct mq_policy *mq, 1057static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
981 dm_oblock_t current_oblock, dm_oblock_t new_oblock) 1058 dm_cblock_t *cblock)
982{ 1059{
983 struct entry *e = hash_lookup(mq, current_oblock); 1060 struct entry *e = pop(mq, &mq->cache_dirty);
984 1061
985 BUG_ON(!e || !e->in_cache); 1062 if (!e)
1063 return -ENODATA;
986 1064
987 del(mq, e); 1065 *oblock = e->oblock;
988 e->oblock = new_oblock; 1066 *cblock = infer_cblock(&mq->cache_pool, e);
1067 e->dirty = false;
989 push(mq, e); 1068 push(mq, e);
1069
1070 return 0;
1071}
1072
1073static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
1074 dm_cblock_t *cblock)
1075{
1076 int r;
1077 struct mq_policy *mq = to_mq_policy(p);
1078
1079 mutex_lock(&mq->lock);
1080 r = __mq_writeback_work(mq, oblock, cblock);
1081 mutex_unlock(&mq->lock);
1082
1083 return r;
1084}
1085
1086static void __force_mapping(struct mq_policy *mq,
1087 dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1088{
1089 struct entry *e = hash_lookup(mq, current_oblock);
1090
1091 if (e && in_cache(mq, e)) {
1092 del(mq, e);
1093 e->oblock = new_oblock;
1094 e->dirty = true;
1095 push(mq, e);
1096 }
990} 1097}
991 1098
992static void mq_force_mapping(struct dm_cache_policy *p, 1099static void mq_force_mapping(struct dm_cache_policy *p,
@@ -995,16 +1102,20 @@ static void mq_force_mapping(struct dm_cache_policy *p,
995 struct mq_policy *mq = to_mq_policy(p); 1102 struct mq_policy *mq = to_mq_policy(p);
996 1103
997 mutex_lock(&mq->lock); 1104 mutex_lock(&mq->lock);
998 force_mapping(mq, current_oblock, new_oblock); 1105 __force_mapping(mq, current_oblock, new_oblock);
999 mutex_unlock(&mq->lock); 1106 mutex_unlock(&mq->lock);
1000} 1107}
1001 1108
1002static dm_cblock_t mq_residency(struct dm_cache_policy *p) 1109static dm_cblock_t mq_residency(struct dm_cache_policy *p)
1003{ 1110{
1111 dm_cblock_t r;
1004 struct mq_policy *mq = to_mq_policy(p); 1112 struct mq_policy *mq = to_mq_policy(p);
1005 1113
1006 /* FIXME: lock mutex, not sure we can block here */ 1114 mutex_lock(&mq->lock);
1007 return to_cblock(mq->nr_cblocks_allocated); 1115 r = to_cblock(mq->cache_pool.nr_allocated);
1116 mutex_unlock(&mq->lock);
1117
1118 return r;
1008} 1119}
1009 1120
1010static void mq_tick(struct dm_cache_policy *p) 1121static void mq_tick(struct dm_cache_policy *p)
@@ -1057,10 +1168,13 @@ static void init_policy_functions(struct mq_policy *mq)
1057 mq->policy.destroy = mq_destroy; 1168 mq->policy.destroy = mq_destroy;
1058 mq->policy.map = mq_map; 1169 mq->policy.map = mq_map;
1059 mq->policy.lookup = mq_lookup; 1170 mq->policy.lookup = mq_lookup;
1171 mq->policy.set_dirty = mq_set_dirty;
1172 mq->policy.clear_dirty = mq_clear_dirty;
1060 mq->policy.load_mapping = mq_load_mapping; 1173 mq->policy.load_mapping = mq_load_mapping;
1061 mq->policy.walk_mappings = mq_walk_mappings; 1174 mq->policy.walk_mappings = mq_walk_mappings;
1062 mq->policy.remove_mapping = mq_remove_mapping; 1175 mq->policy.remove_mapping = mq_remove_mapping;
1063 mq->policy.writeback_work = NULL; 1176 mq->policy.remove_cblock = mq_remove_cblock;
1177 mq->policy.writeback_work = mq_writeback_work;
1064 mq->policy.force_mapping = mq_force_mapping; 1178 mq->policy.force_mapping = mq_force_mapping;
1065 mq->policy.residency = mq_residency; 1179 mq->policy.residency = mq_residency;
1066 mq->policy.tick = mq_tick; 1180 mq->policy.tick = mq_tick;
@@ -1072,7 +1186,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1072 sector_t origin_size, 1186 sector_t origin_size,
1073 sector_t cache_block_size) 1187 sector_t cache_block_size)
1074{ 1188{
1075 int r;
1076 struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); 1189 struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1077 1190
1078 if (!mq) 1191 if (!mq)
@@ -1080,8 +1193,18 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1080 1193
1081 init_policy_functions(mq); 1194 init_policy_functions(mq);
1082 iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); 1195 iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
1083
1084 mq->cache_size = cache_size; 1196 mq->cache_size = cache_size;
1197
1198 if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
1199 DMERR("couldn't initialize pool of pre-cache entries");
1200 goto bad_pre_cache_init;
1201 }
1202
1203 if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
1204 DMERR("couldn't initialize pool of cache entries");
1205 goto bad_cache_init;
1206 }
1207
1085 mq->tick_protected = 0; 1208 mq->tick_protected = 0;
1086 mq->tick = 0; 1209 mq->tick = 0;
1087 mq->hit_count = 0; 1210 mq->hit_count = 0;
@@ -1089,20 +1212,12 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1089 mq->promote_threshold = 0; 1212 mq->promote_threshold = 0;
1090 mutex_init(&mq->lock); 1213 mutex_init(&mq->lock);
1091 spin_lock_init(&mq->tick_lock); 1214 spin_lock_init(&mq->tick_lock);
1092 mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG);
1093 mq->find_free_last_word = 0;
1094 1215
1095 queue_init(&mq->pre_cache); 1216 queue_init(&mq->pre_cache);
1096 queue_init(&mq->cache); 1217 queue_init(&mq->cache_clean);
1097 mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); 1218 queue_init(&mq->cache_dirty);
1098 1219
1099 mq->nr_entries = 2 * from_cblock(cache_size); 1220 mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
1100 r = alloc_entries(mq, mq->nr_entries);
1101 if (r)
1102 goto bad_cache_alloc;
1103
1104 mq->nr_entries_allocated = 0;
1105 mq->nr_cblocks_allocated = 0;
1106 1221
1107 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); 1222 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
1108 mq->hash_bits = ffs(mq->nr_buckets) - 1; 1223 mq->hash_bits = ffs(mq->nr_buckets) - 1;
@@ -1110,17 +1225,13 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1110 if (!mq->table) 1225 if (!mq->table)
1111 goto bad_alloc_table; 1226 goto bad_alloc_table;
1112 1227
1113 mq->allocation_bitset = alloc_bitset(from_cblock(cache_size));
1114 if (!mq->allocation_bitset)
1115 goto bad_alloc_bitset;
1116
1117 return &mq->policy; 1228 return &mq->policy;
1118 1229
1119bad_alloc_bitset:
1120 kfree(mq->table);
1121bad_alloc_table: 1230bad_alloc_table:
1122 free_entries(mq); 1231 epool_exit(&mq->cache_pool);
1123bad_cache_alloc: 1232bad_cache_init:
1233 epool_exit(&mq->pre_cache_pool);
1234bad_pre_cache_init:
1124 kfree(mq); 1235 kfree(mq);
1125 1236
1126 return NULL; 1237 return NULL;
@@ -1130,7 +1241,7 @@ bad_cache_alloc:
1130 1241
1131static struct dm_cache_policy_type mq_policy_type = { 1242static struct dm_cache_policy_type mq_policy_type = {
1132 .name = "mq", 1243 .name = "mq",
1133 .version = {1, 0, 0}, 1244 .version = {1, 1, 0},
1134 .hint_size = 4, 1245 .hint_size = 4,
1135 .owner = THIS_MODULE, 1246 .owner = THIS_MODULE,
1136 .create = mq_create 1247 .create = mq_create
@@ -1138,7 +1249,7 @@ static struct dm_cache_policy_type mq_policy_type = {
1138 1249
1139static struct dm_cache_policy_type default_policy_type = { 1250static struct dm_cache_policy_type default_policy_type = {
1140 .name = "default", 1251 .name = "default",
1141 .version = {1, 0, 0}, 1252 .version = {1, 1, 0},
1142 .hint_size = 4, 1253 .hint_size = 4,
1143 .owner = THIS_MODULE, 1254 .owner = THIS_MODULE,
1144 .create = mq_create 1255 .create = mq_create
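The entry_pool introduced by this patch replaces both the per-entry cblock field and the old allocation bitset: entries live in one preallocated array, the cache block number is simply an entry's index in that array, and free entries are threaded onto a list. A hypothetical smoke test, illustrative only, built from the pool helpers added above:

static int epool_smoke_test(void)	/* not part of the patch */
{
	struct entry_pool ep;
	struct entry *e;

	if (epool_init(&ep, 1024))
		return -ENOMEM;				/* vzalloc failed */

	e = alloc_particular_entry(&ep, to_cblock(7));	/* claim cblock 7 */
	BUG_ON(from_cblock(infer_cblock(&ep, e)) != 7);	/* index <-> cblock identity */
	BUG_ON(!in_pool(&ep, e));

	free_entry(&ep, e);				/* back onto ep.free */
	epool_exit(&ep);

	return 0;
}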
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
index 21c03c570c06..d80057968407 100644
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -119,13 +119,13 @@ struct dm_cache_policy *dm_cache_policy_create(const char *name,
119 type = get_policy(name); 119 type = get_policy(name);
120 if (!type) { 120 if (!type) {
121 DMWARN("unknown policy type"); 121 DMWARN("unknown policy type");
122 return NULL; 122 return ERR_PTR(-EINVAL);
123 } 123 }
124 124
125 p = type->create(cache_size, origin_size, cache_block_size); 125 p = type->create(cache_size, origin_size, cache_block_size);
126 if (!p) { 126 if (!p) {
127 put_policy(type); 127 put_policy(type);
128 return NULL; 128 return ERR_PTR(-ENOMEM);
129 } 129 }
130 p->private = type; 130 p->private = type;
131 131
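dm_cache_policy_create() now signals failure with ERR_PTR(-EINVAL) or ERR_PTR(-ENOMEM) instead of NULL, so callers have to switch from NULL checks to IS_ERR()/PTR_ERR(). A hedged caller-side sketch (the target-side wiring shown here is illustrative, not the code changed elsewhere in this series):

	struct dm_cache_policy *p;

	p = dm_cache_policy_create(name, cache_size, origin_size, cache_block_size);
	if (IS_ERR(p))
		return PTR_ERR(p);	/* -EINVAL: unknown policy, -ENOMEM: create failed */

	cache->policy = p;		/* illustrative: attach to the (hypothetical) cache */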
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 33369ca9614f..052c00a84a5c 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -135,9 +135,6 @@ struct dm_cache_policy {
135 */ 135 */
136 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); 136 int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
137 137
138 /*
139 * oblock must be a mapped block. Must not block.
140 */
141 void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 138 void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
142 void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); 139 void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
143 140
@@ -159,8 +156,24 @@ struct dm_cache_policy {
159 void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, 156 void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
160 dm_oblock_t new_oblock); 157 dm_oblock_t new_oblock);
161 158
162 int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); 159 /*
160 * This is called via the invalidate_cblocks message. It is
161 * possible the particular cblock has already been removed due to a
162 * write io in passthrough mode. In which case this should return
163 * -ENODATA.
164 */
165 int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
163 166
167 /*
168 * Provide a dirty block to be written back by the core target.
169 *
170 * Returns:
171 *
172 * 0 and @cblock,@oblock: block to write back provided
173 *
174 * -ENODATA: no dirty blocks available
175 */
176 int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
164 177
165 /* 178 /*
166 * How full is the cache? 179 * How full is the cache?
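The writeback_work() contract documented above suggests a simple drain pattern in the core target: keep asking the policy for dirty blocks until it reports -ENODATA. An illustrative sketch only (writeback_block() is a hypothetical stand-in for the target's real migration machinery, and the loop ignores locking and quiescing concerns):

static void drain_dirty_blocks(struct cache *cache)
{
	int r;
	dm_oblock_t oblock;
	dm_cblock_t cblock;

	for (;;) {
		r = policy_writeback_work(cache->policy, &oblock, &cblock);
		if (r)
			break;				/* -ENODATA: nothing dirty left */

		writeback_block(cache, oblock, cblock);	/* hypothetical */
	}
}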
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 29569768ffbf..9efcf1059b99 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -61,6 +61,34 @@ static void free_bitset(unsigned long *bits)
61 61
62/*----------------------------------------------------------------*/ 62/*----------------------------------------------------------------*/
63 63
64/*
65 * There are a couple of places where we let a bio run, but want to do some
66 * work before calling its endio function. We do this by temporarily
67 * changing the endio fn.
68 */
69struct dm_hook_info {
70 bio_end_io_t *bi_end_io;
71 void *bi_private;
72};
73
74static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
75 bio_end_io_t *bi_end_io, void *bi_private)
76{
77 h->bi_end_io = bio->bi_end_io;
78 h->bi_private = bio->bi_private;
79
80 bio->bi_end_io = bi_end_io;
81 bio->bi_private = bi_private;
82}
83
84static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
85{
86 bio->bi_end_io = h->bi_end_io;
87 bio->bi_private = h->bi_private;
88}
89
90/*----------------------------------------------------------------*/
91
64#define PRISON_CELLS 1024 92#define PRISON_CELLS 1024
65#define MIGRATION_POOL_SIZE 128 93#define MIGRATION_POOL_SIZE 128
66#define COMMIT_PERIOD HZ 94#define COMMIT_PERIOD HZ
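dm_hook_bio()/dm_unhook_bio() above simply stash and later restore a bio's completion callback and private pointer, letting the target run extra work when the bio completes. A minimal illustrative sketch, not the patch's own usage (the patch keeps the dm_hook_info in struct per_bio_data and recovers it via get_per_bio_data(); here the hook_info is passed through bi_private purely for illustration):

static void my_endio(struct bio *bio, int err)
{
	struct dm_hook_info *h = bio->bi_private;	/* stashed at hook time below */

	/* ... extra completion-time work goes here ... */

	dm_unhook_bio(h, bio);		/* restore the original endio/private */
	bio_endio(bio, err);		/* complete the bio as usual */
}

/* on the submission path (pb is the per-bio data, as in the patch): */
dm_hook_bio(&pb->hook_info, bio, my_endio, &pb->hook_info);
/* then remap and submit the bio as normal */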
@@ -76,14 +104,37 @@ static void free_bitset(unsigned long *bits)
76/* 104/*
77 * FIXME: the cache is read/write for the time being. 105 * FIXME: the cache is read/write for the time being.
78 */ 106 */
79enum cache_mode { 107enum cache_metadata_mode {
80 CM_WRITE, /* metadata may be changed */ 108 CM_WRITE, /* metadata may be changed */
81 CM_READ_ONLY, /* metadata may not be changed */ 109 CM_READ_ONLY, /* metadata may not be changed */
82}; 110};
83 111
112enum cache_io_mode {
113 /*
114 * Data is written to cached blocks only. These blocks are marked
115 * dirty. If you lose the cache device you will lose data.
116 * Potential performance increase for both reads and writes.
117 */
118 CM_IO_WRITEBACK,
119
120 /*
121 * Data is written to both cache and origin. Blocks are never
122 * dirty. Potential performance benfit for reads only.
123 */
124 CM_IO_WRITETHROUGH,
125
126 /*
127 * A degraded mode useful for various cache coherency situations
128 * (eg, rolling back snapshots). Reads and writes always go to the
129 * origin. If a write goes to a cached oblock, then the cache
130 * block is invalidated.
131 */
132 CM_IO_PASSTHROUGH
133};
134
84struct cache_features { 135struct cache_features {
85 enum cache_mode mode; 136 enum cache_metadata_mode mode;
86 bool write_through:1; 137 enum cache_io_mode io_mode;
87}; 138};
88 139
89struct cache_stats { 140struct cache_stats {
@@ -99,6 +150,25 @@ struct cache_stats {
99 atomic_t discard_count; 150 atomic_t discard_count;
100}; 151};
101 152
153/*
154 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
155 * the one-past-the-end value.
156 */
157struct cblock_range {
158 dm_cblock_t begin;
159 dm_cblock_t end;
160};
161
162struct invalidation_request {
163 struct list_head list;
164 struct cblock_range *cblocks;
165
166 atomic_t complete;
167 int err;
168
169 wait_queue_head_t result_wait;
170};
171
102struct cache { 172struct cache {
103 struct dm_target *ti; 173 struct dm_target *ti;
104 struct dm_target_callbacks callbacks; 174 struct dm_target_callbacks callbacks;
@@ -148,6 +218,10 @@ struct cache {
148 wait_queue_head_t migration_wait; 218 wait_queue_head_t migration_wait;
149 atomic_t nr_migrations; 219 atomic_t nr_migrations;
150 220
221 wait_queue_head_t quiescing_wait;
222 atomic_t quiescing;
223 atomic_t quiescing_ack;
224
151 /* 225 /*
152 * cache_size entries, dirty if set 226 * cache_size entries, dirty if set
153 */ 227 */
@@ -186,7 +260,7 @@ struct cache {
186 260
187 bool need_tick_bio:1; 261 bool need_tick_bio:1;
188 bool sized:1; 262 bool sized:1;
189 bool quiescing:1; 263 bool invalidate:1;
190 bool commit_requested:1; 264 bool commit_requested:1;
191 bool loaded_mappings:1; 265 bool loaded_mappings:1;
192 bool loaded_discards:1; 266 bool loaded_discards:1;
@@ -197,6 +271,12 @@ struct cache {
197 struct cache_features features; 271 struct cache_features features;
198 272
199 struct cache_stats stats; 273 struct cache_stats stats;
274
275 /*
276 * Invalidation fields.
277 */
278 spinlock_t invalidation_lock;
279 struct list_head invalidation_requests;
200}; 280};
201 281
202struct per_bio_data { 282struct per_bio_data {
@@ -211,7 +291,7 @@ struct per_bio_data {
211 */ 291 */
212 struct cache *cache; 292 struct cache *cache;
213 dm_cblock_t cblock; 293 dm_cblock_t cblock;
214 bio_end_io_t *saved_bi_end_io; 294 struct dm_hook_info hook_info;
215 struct dm_bio_details bio_details; 295 struct dm_bio_details bio_details;
216}; 296};
217 297
@@ -228,6 +308,8 @@ struct dm_cache_migration {
228 bool writeback:1; 308 bool writeback:1;
229 bool demote:1; 309 bool demote:1;
230 bool promote:1; 310 bool promote:1;
311 bool requeue_holder:1;
312 bool invalidate:1;
231 313
232 struct dm_bio_prison_cell *old_ocell; 314 struct dm_bio_prison_cell *old_ocell;
233 struct dm_bio_prison_cell *new_ocell; 315 struct dm_bio_prison_cell *new_ocell;
@@ -533,9 +615,24 @@ static void save_stats(struct cache *cache)
533#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 615#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
534#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 616#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
535 617
618static bool writethrough_mode(struct cache_features *f)
619{
620 return f->io_mode == CM_IO_WRITETHROUGH;
621}
622
623static bool writeback_mode(struct cache_features *f)
624{
625 return f->io_mode == CM_IO_WRITEBACK;
626}
627
628static bool passthrough_mode(struct cache_features *f)
629{
630 return f->io_mode == CM_IO_PASSTHROUGH;
631}
632
536static size_t get_per_bio_data_size(struct cache *cache) 633static size_t get_per_bio_data_size(struct cache *cache)
537{ 634{
538 return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 635 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
539} 636}
540 637
541static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 638static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
@@ -605,6 +702,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
605static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 702static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
606 dm_oblock_t oblock, dm_cblock_t cblock) 703 dm_oblock_t oblock, dm_cblock_t cblock)
607{ 704{
705 check_if_tick_bio_needed(cache, bio);
608 remap_to_cache(cache, bio, cblock); 706 remap_to_cache(cache, bio, cblock);
609 if (bio_data_dir(bio) == WRITE) { 707 if (bio_data_dir(bio) == WRITE) {
610 set_dirty(cache, oblock, cblock); 708 set_dirty(cache, oblock, cblock);
@@ -662,7 +760,8 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
662static void writethrough_endio(struct bio *bio, int err) 760static void writethrough_endio(struct bio *bio, int err)
663{ 761{
664 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 762 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
665 bio->bi_end_io = pb->saved_bi_end_io; 763
764 dm_unhook_bio(&pb->hook_info, bio);
666 765
667 if (err) { 766 if (err) {
668 bio_endio(bio, err); 767 bio_endio(bio, err);
@@ -693,9 +792,8 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
693 792
694 pb->cache = cache; 793 pb->cache = cache;
695 pb->cblock = cblock; 794 pb->cblock = cblock;
696 pb->saved_bi_end_io = bio->bi_end_io; 795 dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
697 dm_bio_record(&pb->bio_details, bio); 796 dm_bio_record(&pb->bio_details, bio);
698 bio->bi_end_io = writethrough_endio;
699 797
700 remap_to_origin_clear_discard(pb->cache, bio, oblock); 798 remap_to_origin_clear_discard(pb->cache, bio, oblock);
701} 799}
@@ -748,8 +846,9 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
748 846
749static void cleanup_migration(struct dm_cache_migration *mg) 847static void cleanup_migration(struct dm_cache_migration *mg)
750{ 848{
751 dec_nr_migrations(mg->cache); 849 struct cache *cache = mg->cache;
752 free_migration(mg); 850 free_migration(mg);
851 dec_nr_migrations(cache);
753} 852}
754 853
755static void migration_failure(struct dm_cache_migration *mg) 854static void migration_failure(struct dm_cache_migration *mg)
@@ -765,13 +864,13 @@ static void migration_failure(struct dm_cache_migration *mg)
765 DMWARN_LIMIT("demotion failed; couldn't copy block"); 864 DMWARN_LIMIT("demotion failed; couldn't copy block");
766 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); 865 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
767 866
768 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); 867 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
769 if (mg->promote) 868 if (mg->promote)
770 cell_defer(cache, mg->new_ocell, 1); 869 cell_defer(cache, mg->new_ocell, true);
771 } else { 870 } else {
772 DMWARN_LIMIT("promotion failed; couldn't copy block"); 871 DMWARN_LIMIT("promotion failed; couldn't copy block");
773 policy_remove_mapping(cache->policy, mg->new_oblock); 872 policy_remove_mapping(cache->policy, mg->new_oblock);
774 cell_defer(cache, mg->new_ocell, 1); 873 cell_defer(cache, mg->new_ocell, true);
775 } 874 }
776 875
777 cleanup_migration(mg); 876 cleanup_migration(mg);
@@ -823,7 +922,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
823 return; 922 return;
824 923
825 } else if (mg->demote) { 924 } else if (mg->demote) {
826 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); 925 cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
827 926
828 if (mg->promote) { 927 if (mg->promote) {
829 mg->demote = false; 928 mg->demote = false;
@@ -832,11 +931,19 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
832 list_add_tail(&mg->list, &cache->quiesced_migrations); 931 list_add_tail(&mg->list, &cache->quiesced_migrations);
833 spin_unlock_irqrestore(&cache->lock, flags); 932 spin_unlock_irqrestore(&cache->lock, flags);
834 933
835 } else 934 } else {
935 if (mg->invalidate)
936 policy_remove_mapping(cache->policy, mg->old_oblock);
836 cleanup_migration(mg); 937 cleanup_migration(mg);
938 }
837 939
838 } else { 940 } else {
839 cell_defer(cache, mg->new_ocell, true); 941 if (mg->requeue_holder)
942 cell_defer(cache, mg->new_ocell, true);
943 else {
944 bio_endio(mg->new_ocell->holder, 0);
945 cell_defer(cache, mg->new_ocell, false);
946 }
840 clear_dirty(cache, mg->new_oblock, mg->cblock); 947 clear_dirty(cache, mg->new_oblock, mg->cblock);
841 cleanup_migration(mg); 948 cleanup_migration(mg);
842 } 949 }
@@ -881,8 +988,46 @@ static void issue_copy_real(struct dm_cache_migration *mg)
881 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); 988 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
882 } 989 }
883 990
884 if (r < 0) 991 if (r < 0) {
992 DMERR_LIMIT("issuing migration failed");
885 migration_failure(mg); 993 migration_failure(mg);
994 }
995}
996
997static void overwrite_endio(struct bio *bio, int err)
998{
999 struct dm_cache_migration *mg = bio->bi_private;
1000 struct cache *cache = mg->cache;
1001 size_t pb_data_size = get_per_bio_data_size(cache);
1002 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1003 unsigned long flags;
1004
1005 if (err)
1006 mg->err = true;
1007
1008 spin_lock_irqsave(&cache->lock, flags);
1009 list_add_tail(&mg->list, &cache->completed_migrations);
1010 dm_unhook_bio(&pb->hook_info, bio);
1011 mg->requeue_holder = false;
1012 spin_unlock_irqrestore(&cache->lock, flags);
1013
1014 wake_worker(cache);
1015}
1016
1017static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1018{
1019 size_t pb_data_size = get_per_bio_data_size(mg->cache);
1020 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1021
1022 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1023 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
1024 generic_make_request(bio);
1025}
1026
1027static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1028{
1029 return (bio_data_dir(bio) == WRITE) &&
1030 (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
886} 1031}
887 1032
888static void avoid_copy(struct dm_cache_migration *mg) 1033static void avoid_copy(struct dm_cache_migration *mg)
@@ -899,9 +1044,17 @@ static void issue_copy(struct dm_cache_migration *mg)
899 if (mg->writeback || mg->demote) 1044 if (mg->writeback || mg->demote)
900 avoid = !is_dirty(cache, mg->cblock) || 1045 avoid = !is_dirty(cache, mg->cblock) ||
901 is_discarded_oblock(cache, mg->old_oblock); 1046 is_discarded_oblock(cache, mg->old_oblock);
902 else 1047 else {
1048 struct bio *bio = mg->new_ocell->holder;
1049
903 avoid = is_discarded_oblock(cache, mg->new_oblock); 1050 avoid = is_discarded_oblock(cache, mg->new_oblock);
904 1051
1052 if (!avoid && bio_writes_complete_block(cache, bio)) {
1053 issue_overwrite(mg, bio);
1054 return;
1055 }
1056 }
1057
905 avoid ? avoid_copy(mg) : issue_copy_real(mg); 1058 avoid ? avoid_copy(mg) : issue_copy_real(mg);
906} 1059}
907 1060
@@ -991,6 +1144,8 @@ static void promote(struct cache *cache, struct prealloc *structs,
991 mg->writeback = false; 1144 mg->writeback = false;
992 mg->demote = false; 1145 mg->demote = false;
993 mg->promote = true; 1146 mg->promote = true;
1147 mg->requeue_holder = true;
1148 mg->invalidate = false;
994 mg->cache = cache; 1149 mg->cache = cache;
995 mg->new_oblock = oblock; 1150 mg->new_oblock = oblock;
996 mg->cblock = cblock; 1151 mg->cblock = cblock;
@@ -1012,6 +1167,8 @@ static void writeback(struct cache *cache, struct prealloc *structs,
1012 mg->writeback = true; 1167 mg->writeback = true;
1013 mg->demote = false; 1168 mg->demote = false;
1014 mg->promote = false; 1169 mg->promote = false;
1170 mg->requeue_holder = true;
1171 mg->invalidate = false;
1015 mg->cache = cache; 1172 mg->cache = cache;
1016 mg->old_oblock = oblock; 1173 mg->old_oblock = oblock;
1017 mg->cblock = cblock; 1174 mg->cblock = cblock;
@@ -1035,6 +1192,8 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1035 mg->writeback = false; 1192 mg->writeback = false;
1036 mg->demote = true; 1193 mg->demote = true;
1037 mg->promote = true; 1194 mg->promote = true;
1195 mg->requeue_holder = true;
1196 mg->invalidate = false;
1038 mg->cache = cache; 1197 mg->cache = cache;
1039 mg->old_oblock = old_oblock; 1198 mg->old_oblock = old_oblock;
1040 mg->new_oblock = new_oblock; 1199 mg->new_oblock = new_oblock;
@@ -1047,6 +1206,33 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1047 quiesce_migration(mg); 1206 quiesce_migration(mg);
1048} 1207}
1049 1208
1209/*
1210 * Invalidate a cache entry. No writeback occurs; any changes in the cache
1211 * block are thrown away.
1212 */
1213static void invalidate(struct cache *cache, struct prealloc *structs,
1214 dm_oblock_t oblock, dm_cblock_t cblock,
1215 struct dm_bio_prison_cell *cell)
1216{
1217 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1218
1219 mg->err = false;
1220 mg->writeback = false;
1221 mg->demote = true;
1222 mg->promote = false;
1223 mg->requeue_holder = true;
1224 mg->invalidate = true;
1225 mg->cache = cache;
1226 mg->old_oblock = oblock;
1227 mg->cblock = cblock;
1228 mg->old_ocell = cell;
1229 mg->new_ocell = NULL;
1230 mg->start_jiffies = jiffies;
1231
1232 inc_nr_migrations(cache);
1233 quiesce_migration(mg);
1234}
1235
1050/*---------------------------------------------------------------- 1236/*----------------------------------------------------------------
1051 * bio processing 1237 * bio processing
1052 *--------------------------------------------------------------*/ 1238 *--------------------------------------------------------------*/
@@ -1109,13 +1295,6 @@ static bool spare_migration_bandwidth(struct cache *cache)
1109 return current_volume < cache->migration_threshold; 1295 return current_volume < cache->migration_threshold;
1110} 1296}
1111 1297
1112static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1113 dm_cblock_t cblock)
1114{
1115 return bio_data_dir(bio) == WRITE &&
1116 cache->features.write_through && !is_dirty(cache, cblock);
1117}
1118
1119static void inc_hit_counter(struct cache *cache, struct bio *bio) 1298static void inc_hit_counter(struct cache *cache, struct bio *bio)
1120{ 1299{
1121 atomic_inc(bio_data_dir(bio) == READ ? 1300 atomic_inc(bio_data_dir(bio) == READ ?
@@ -1128,6 +1307,15 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
1128 &cache->stats.read_miss : &cache->stats.write_miss); 1307 &cache->stats.read_miss : &cache->stats.write_miss);
1129} 1308}
1130 1309
1310static void issue_cache_bio(struct cache *cache, struct bio *bio,
1311 struct per_bio_data *pb,
1312 dm_oblock_t oblock, dm_cblock_t cblock)
1313{
1314 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1315 remap_to_cache_dirty(cache, bio, oblock, cblock);
1316 issue(cache, bio);
1317}
1318
1131static void process_bio(struct cache *cache, struct prealloc *structs, 1319static void process_bio(struct cache *cache, struct prealloc *structs,
1132 struct bio *bio) 1320 struct bio *bio)
1133{ 1321{
@@ -1139,7 +1327,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1139 size_t pb_data_size = get_per_bio_data_size(cache); 1327 size_t pb_data_size = get_per_bio_data_size(cache);
1140 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1328 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1141 bool discarded_block = is_discarded_oblock(cache, block); 1329 bool discarded_block = is_discarded_oblock(cache, block);
1142 bool can_migrate = discarded_block || spare_migration_bandwidth(cache); 1330 bool passthrough = passthrough_mode(&cache->features);
1331 bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
1143 1332
1144 /* 1333 /*
1145 * Check to see if that block is currently migrating. 1334 * Check to see if that block is currently migrating.
@@ -1160,15 +1349,39 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
1160 1349
1161 switch (lookup_result.op) { 1350 switch (lookup_result.op) {
1162 case POLICY_HIT: 1351 case POLICY_HIT:
1163 inc_hit_counter(cache, bio); 1352 if (passthrough) {
1164 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 1353 inc_miss_counter(cache, bio);
1165 1354
1166 if (is_writethrough_io(cache, bio, lookup_result.cblock)) 1355 /*
1167 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1356 * Passthrough always maps to the origin,
1168 else 1357 * invalidating any cache blocks that are written
1169 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 1358 * to.
1359 */
1360
1361 if (bio_data_dir(bio) == WRITE) {
1362 atomic_inc(&cache->stats.demotion);
1363 invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
1364 release_cell = false;
1365
1366 } else {
1367 /* FIXME: factor out issue_origin() */
1368 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1369 remap_to_origin_clear_discard(cache, bio, block);
1370 issue(cache, bio);
1371 }
1372 } else {
1373 inc_hit_counter(cache, bio);
1374
1375 if (bio_data_dir(bio) == WRITE &&
1376 writethrough_mode(&cache->features) &&
1377 !is_dirty(cache, lookup_result.cblock)) {
1378 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1379 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1380 issue(cache, bio);
1381 } else
1382 issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
1383 }
1170 1384
1171 issue(cache, bio);
1172 break; 1385 break;
1173 1386
1174 case POLICY_MISS: 1387 case POLICY_MISS:
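The reworked POLICY_HIT branch now picks between four outcomes depending on the io mode, the direction of the bio and the dirty state of the block. A standalone sketch of that decision table (userspace C; the enum and helper are local stand-ins, not kernel symbols):

#include <stdbool.h>
#include <stdio.h>

enum io_mode { WRITEBACK, WRITETHROUGH, PASSTHROUGH };

enum hit_action {
        INVALIDATE_BLOCK,       /* passthrough write: drop the stale cache copy */
        ISSUE_TO_ORIGIN,        /* passthrough read: the cache is bypassed */
        ORIGIN_THEN_CACHE,      /* writethrough write to a clean block */
        ISSUE_TO_CACHE          /* everything else: serve from the cache device */
};

static enum hit_action policy_hit_action(enum io_mode mode, bool is_write,
                                         bool block_dirty)
{
        if (mode == PASSTHROUGH)
                return is_write ? INVALIDATE_BLOCK : ISSUE_TO_ORIGIN;

        if (is_write && mode == WRITETHROUGH && !block_dirty)
                return ORIGIN_THEN_CACHE;

        return ISSUE_TO_CACHE;
}

int main(void)
{
        printf("%d\n", policy_hit_action(PASSTHROUGH, true, false));    /* INVALIDATE_BLOCK */
        printf("%d\n", policy_hit_action(WRITETHROUGH, true, false));   /* ORIGIN_THEN_CACHE */
        printf("%d\n", policy_hit_action(WRITEBACK, true, true));       /* ISSUE_TO_CACHE */
        return 0;
}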
@@ -1227,15 +1440,17 @@ static int need_commit_due_to_time(struct cache *cache)
1227 1440
1228static int commit_if_needed(struct cache *cache) 1441static int commit_if_needed(struct cache *cache)
1229{ 1442{
1230 if (dm_cache_changed_this_transaction(cache->cmd) && 1443 int r = 0;
1231 (cache->commit_requested || need_commit_due_to_time(cache))) { 1444
1445 if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
1446 dm_cache_changed_this_transaction(cache->cmd)) {
1232 atomic_inc(&cache->stats.commit_count); 1447 atomic_inc(&cache->stats.commit_count);
1233 cache->last_commit_jiffies = jiffies;
1234 cache->commit_requested = false; 1448 cache->commit_requested = false;
1235 return dm_cache_commit(cache->cmd, false); 1449 r = dm_cache_commit(cache->cmd, false);
1450 cache->last_commit_jiffies = jiffies;
1236 } 1451 }
1237 1452
1238 return 0; 1453 return r;
1239} 1454}
1240 1455
1241static void process_deferred_bios(struct cache *cache) 1456static void process_deferred_bios(struct cache *cache)
@@ -1344,36 +1559,88 @@ static void writeback_some_dirty_blocks(struct cache *cache)
1344} 1559}
1345 1560
1346/*---------------------------------------------------------------- 1561/*----------------------------------------------------------------
1347 * Main worker loop 1562 * Invalidations.
1563 * Dropping something from the cache *without* writing back.
1348 *--------------------------------------------------------------*/ 1564 *--------------------------------------------------------------*/
1349static void start_quiescing(struct cache *cache) 1565
1566static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
1350{ 1567{
1351 unsigned long flags; 1568 int r = 0;
1569 uint64_t begin = from_cblock(req->cblocks->begin);
1570 uint64_t end = from_cblock(req->cblocks->end);
1352 1571
1353 spin_lock_irqsave(&cache->lock, flags); 1572 while (begin != end) {
1354 cache->quiescing = 1; 1573 r = policy_remove_cblock(cache->policy, to_cblock(begin));
1355 spin_unlock_irqrestore(&cache->lock, flags); 1574 if (!r) {
1575 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
1576 if (r)
1577 break;
1578
1579 } else if (r == -ENODATA) {
1580 /* harmless, already unmapped */
1581 r = 0;
1582
1583 } else {
1584 DMERR("policy_remove_cblock failed");
1585 break;
1586 }
1587
1588 begin++;
1589 }
1590
1591 cache->commit_requested = true;
1592
1593 req->err = r;
1594 atomic_set(&req->complete, 1);
1595
1596 wake_up(&req->result_wait);
1356} 1597}
1357 1598
1358static void stop_quiescing(struct cache *cache) 1599static void process_invalidation_requests(struct cache *cache)
1359{ 1600{
1360 unsigned long flags; 1601 struct list_head list;
1602 struct invalidation_request *req, *tmp;
1361 1603
1362 spin_lock_irqsave(&cache->lock, flags); 1604 INIT_LIST_HEAD(&list);
1363 cache->quiescing = 0; 1605 spin_lock(&cache->invalidation_lock);
1364 spin_unlock_irqrestore(&cache->lock, flags); 1606 list_splice_init(&cache->invalidation_requests, &list);
1607 spin_unlock(&cache->invalidation_lock);
1608
1609 list_for_each_entry_safe (req, tmp, &list, list)
1610 process_invalidation_request(cache, req);
1365} 1611}
1366 1612
1613/*----------------------------------------------------------------
1614 * Main worker loop
1615 *--------------------------------------------------------------*/
1367static bool is_quiescing(struct cache *cache) 1616static bool is_quiescing(struct cache *cache)
1368{ 1617{
1369 int r; 1618 return atomic_read(&cache->quiescing);
1370 unsigned long flags; 1619}
1371 1620
1372 spin_lock_irqsave(&cache->lock, flags); 1621static void ack_quiescing(struct cache *cache)
1373 r = cache->quiescing; 1622{
1374 spin_unlock_irqrestore(&cache->lock, flags); 1623 if (is_quiescing(cache)) {
1624 atomic_inc(&cache->quiescing_ack);
1625 wake_up(&cache->quiescing_wait);
1626 }
1627}
1375 1628
1376 return r; 1629static void wait_for_quiescing_ack(struct cache *cache)
1630{
1631 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
1632}
1633
1634static void start_quiescing(struct cache *cache)
1635{
1636 atomic_inc(&cache->quiescing);
1637 wait_for_quiescing_ack(cache);
1638}
1639
1640static void stop_quiescing(struct cache *cache)
1641{
1642 atomic_set(&cache->quiescing, 0);
1643 atomic_set(&cache->quiescing_ack, 0);
1377} 1644}
1378 1645
1379static void wait_for_migrations(struct cache *cache) 1646static void wait_for_migrations(struct cache *cache)
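The new quiescing scheme is a two-way handshake: the caller raises the quiescing flag and blocks until the worker, which now checks the flag at the end of every loop iteration, acknowledges it. A small pthreads model of the same handshake (a simplification: plain flags instead of the kernel's atomic counters and waitqueue):

/* build: cc -pthread quiesce_model.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct cache_model {
        atomic_bool quiescing;
        atomic_bool quiescing_ack;
        atomic_bool stop;
        pthread_mutex_t lock;
        pthread_cond_t ack_cond;
};

static void *worker(void *arg)
{
        struct cache_model *c = arg;

        while (!atomic_load(&c->stop)) {
                if (!atomic_load(&c->quiescing)) {
                        /* ...process deferred bios and invalidation requests... */
                }

                /* ...process quiesced/completed migrations, commit if needed... */

                /* ack_quiescing(): acknowledge once new work is no longer picked up */
                if (atomic_load(&c->quiescing) && !atomic_load(&c->quiescing_ack)) {
                        pthread_mutex_lock(&c->lock);
                        atomic_store(&c->quiescing_ack, true);
                        pthread_cond_signal(&c->ack_cond);
                        pthread_mutex_unlock(&c->lock);
                }
                usleep(1000);
        }
        return NULL;
}

static void start_quiescing(struct cache_model *c)
{
        atomic_store(&c->quiescing, true);

        /* wait_for_quiescing_ack(): block until the worker has seen the flag */
        pthread_mutex_lock(&c->lock);
        while (!atomic_load(&c->quiescing_ack))
                pthread_cond_wait(&c->ack_cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

static void stop_quiescing(struct cache_model *c)
{
        atomic_store(&c->quiescing, false);
        atomic_store(&c->quiescing_ack, false);
}

int main(void)
{
        struct cache_model c = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .ack_cond = PTHREAD_COND_INITIALIZER,
        };
        pthread_t t;

        pthread_create(&t, NULL, worker, &c);
        start_quiescing(&c);            /* returns only after the worker has acked */
        printf("worker acknowledged quiescing\n");
        stop_quiescing(&c);
        atomic_store(&c.stop, true);
        pthread_join(t, NULL);
        return 0;
}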
@@ -1412,7 +1679,8 @@ static int more_work(struct cache *cache)
1412 !bio_list_empty(&cache->deferred_writethrough_bios) || 1679 !bio_list_empty(&cache->deferred_writethrough_bios) ||
1413 !list_empty(&cache->quiesced_migrations) || 1680 !list_empty(&cache->quiesced_migrations) ||
1414 !list_empty(&cache->completed_migrations) || 1681 !list_empty(&cache->completed_migrations) ||
1415 !list_empty(&cache->need_commit_migrations); 1682 !list_empty(&cache->need_commit_migrations) ||
1683 cache->invalidate;
1416} 1684}
1417 1685
1418static void do_worker(struct work_struct *ws) 1686static void do_worker(struct work_struct *ws)
@@ -1420,16 +1688,16 @@ static void do_worker(struct work_struct *ws)
1420 struct cache *cache = container_of(ws, struct cache, worker); 1688 struct cache *cache = container_of(ws, struct cache, worker);
1421 1689
1422 do { 1690 do {
1423 if (!is_quiescing(cache)) 1691 if (!is_quiescing(cache)) {
1692 writeback_some_dirty_blocks(cache);
1693 process_deferred_writethrough_bios(cache);
1424 process_deferred_bios(cache); 1694 process_deferred_bios(cache);
1695 process_invalidation_requests(cache);
1696 }
1425 1697
1426 process_migrations(cache, &cache->quiesced_migrations, issue_copy); 1698 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1427 process_migrations(cache, &cache->completed_migrations, complete_migration); 1699 process_migrations(cache, &cache->completed_migrations, complete_migration);
1428 1700
1429 writeback_some_dirty_blocks(cache);
1430
1431 process_deferred_writethrough_bios(cache);
1432
1433 if (commit_if_needed(cache)) { 1701 if (commit_if_needed(cache)) {
1434 process_deferred_flush_bios(cache, false); 1702 process_deferred_flush_bios(cache, false);
1435 1703
@@ -1442,6 +1710,9 @@ static void do_worker(struct work_struct *ws)
1442 process_migrations(cache, &cache->need_commit_migrations, 1710 process_migrations(cache, &cache->need_commit_migrations,
1443 migration_success_post_commit); 1711 migration_success_post_commit);
1444 } 1712 }
1713
1714 ack_quiescing(cache);
1715
1445 } while (more_work(cache)); 1716 } while (more_work(cache));
1446} 1717}
1447 1718
@@ -1715,7 +1986,7 @@ static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1715static void init_features(struct cache_features *cf) 1986static void init_features(struct cache_features *cf)
1716{ 1987{
1717 cf->mode = CM_WRITE; 1988 cf->mode = CM_WRITE;
1718 cf->write_through = false; 1989 cf->io_mode = CM_IO_WRITEBACK;
1719} 1990}
1720 1991
1721static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 1992static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
@@ -1740,10 +2011,13 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1740 arg = dm_shift_arg(as); 2011 arg = dm_shift_arg(as);
1741 2012
1742 if (!strcasecmp(arg, "writeback")) 2013 if (!strcasecmp(arg, "writeback"))
1743 cf->write_through = false; 2014 cf->io_mode = CM_IO_WRITEBACK;
1744 2015
1745 else if (!strcasecmp(arg, "writethrough")) 2016 else if (!strcasecmp(arg, "writethrough"))
1746 cf->write_through = true; 2017 cf->io_mode = CM_IO_WRITETHROUGH;
2018
2019 else if (!strcasecmp(arg, "passthrough"))
2020 cf->io_mode = CM_IO_PASSTHROUGH;
1747 2021
1748 else { 2022 else {
1749 *error = "Unrecognised cache feature requested"; 2023 *error = "Unrecognised cache feature requested";
@@ -1872,14 +2146,15 @@ static int set_config_values(struct cache *cache, int argc, const char **argv)
1872static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2146static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1873 char **error) 2147 char **error)
1874{ 2148{
1875 cache->policy = dm_cache_policy_create(ca->policy_name, 2149 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
1876 cache->cache_size, 2150 cache->cache_size,
1877 cache->origin_sectors, 2151 cache->origin_sectors,
1878 cache->sectors_per_block); 2152 cache->sectors_per_block);
1879 if (!cache->policy) { 2153 if (IS_ERR(p)) {
1880 *error = "Error creating cache's policy"; 2154 *error = "Error creating cache's policy";
1881 return -ENOMEM; 2155 return PTR_ERR(p);
1882 } 2156 }
2157 cache->policy = p;
1883 2158
1884 return 0; 2159 return 0;
1885} 2160}
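create_cache_policy() now propagates the exact error carried by the policy constructor instead of collapsing every failure to -ENOMEM. That relies on the kernel's error-pointer idiom; the sketch below re-implements err_ptr()/is_err()/ptr_err() locally for userspace (they are stand-ins, not the kernel headers), with a dummy constructor in place of dm_cache_policy_create():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long error) { return (void *)error; }
static inline long ptr_err(const void *ptr) { return (long)ptr; }
static inline int is_err(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct policy { const char *name; };

/* Stand-in constructor: returns an error pointer on failure, like the
 * policy constructor the cache target calls. */
static struct policy *policy_create(const char *name)
{
        struct policy *p;

        if (!name || !*name)
                return err_ptr(-EINVAL);

        p = malloc(sizeof(*p));
        if (!p)
                return err_ptr(-ENOMEM);

        p->name = name;
        return p;
}

int main(void)
{
        struct policy *p = policy_create("");

        if (is_err(p))
                /* propagate the specific errno rather than a blanket -ENOMEM */
                fprintf(stderr, "create failed: %ld\n", ptr_err(p));

        p = policy_create("mq");
        if (!is_err(p)) {
                printf("created policy %s\n", p->name);
                free(p);
        }
        return 0;
}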
@@ -1995,6 +2270,22 @@ static int cache_create(struct cache_args *ca, struct cache **result)
1995 } 2270 }
1996 cache->cmd = cmd; 2271 cache->cmd = cmd;
1997 2272
2273 if (passthrough_mode(&cache->features)) {
2274 bool all_clean;
2275
2276 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2277 if (r) {
2278 *error = "dm_cache_metadata_all_clean() failed";
2279 goto bad;
2280 }
2281
2282 if (!all_clean) {
2283 *error = "Cannot enter passthrough mode unless all blocks are clean";
2284 r = -EINVAL;
2285 goto bad;
2286 }
2287 }
2288
1998 spin_lock_init(&cache->lock); 2289 spin_lock_init(&cache->lock);
1999 bio_list_init(&cache->deferred_bios); 2290 bio_list_init(&cache->deferred_bios);
2000 bio_list_init(&cache->deferred_flush_bios); 2291 bio_list_init(&cache->deferred_flush_bios);
@@ -2005,6 +2296,10 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2005 atomic_set(&cache->nr_migrations, 0); 2296 atomic_set(&cache->nr_migrations, 0);
2006 init_waitqueue_head(&cache->migration_wait); 2297 init_waitqueue_head(&cache->migration_wait);
2007 2298
2299 init_waitqueue_head(&cache->quiescing_wait);
2300 atomic_set(&cache->quiescing, 0);
2301 atomic_set(&cache->quiescing_ack, 0);
2302
2008 r = -ENOMEM; 2303 r = -ENOMEM;
2009 cache->nr_dirty = 0; 2304 cache->nr_dirty = 0;
2010 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2305 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
@@ -2064,7 +2359,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2064 2359
2065 cache->need_tick_bio = true; 2360 cache->need_tick_bio = true;
2066 cache->sized = false; 2361 cache->sized = false;
2067 cache->quiescing = false; 2362 cache->invalidate = false;
2068 cache->commit_requested = false; 2363 cache->commit_requested = false;
2069 cache->loaded_mappings = false; 2364 cache->loaded_mappings = false;
2070 cache->loaded_discards = false; 2365 cache->loaded_discards = false;
@@ -2078,6 +2373,9 @@ static int cache_create(struct cache_args *ca, struct cache **result)
2078 atomic_set(&cache->stats.commit_count, 0); 2373 atomic_set(&cache->stats.commit_count, 0);
2079 atomic_set(&cache->stats.discard_count, 0); 2374 atomic_set(&cache->stats.discard_count, 0);
2080 2375
2376 spin_lock_init(&cache->invalidation_lock);
2377 INIT_LIST_HEAD(&cache->invalidation_requests);
2378
2081 *result = cache; 2379 *result = cache;
2082 return 0; 2380 return 0;
2083 2381
@@ -2207,17 +2505,37 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2207 return DM_MAPIO_SUBMITTED; 2505 return DM_MAPIO_SUBMITTED;
2208 } 2506 }
2209 2507
2508 r = DM_MAPIO_REMAPPED;
2210 switch (lookup_result.op) { 2509 switch (lookup_result.op) {
2211 case POLICY_HIT: 2510 case POLICY_HIT:
2212 inc_hit_counter(cache, bio); 2511 if (passthrough_mode(&cache->features)) {
2213 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 2512 if (bio_data_dir(bio) == WRITE) {
2513 /*
2514 * We need to invalidate this block, so
2515 * defer for the worker thread.
2516 */
2517 cell_defer(cache, cell, true);
2518 r = DM_MAPIO_SUBMITTED;
2519
2520 } else {
2521 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2522 inc_miss_counter(cache, bio);
2523 remap_to_origin_clear_discard(cache, bio, block);
2524
2525 cell_defer(cache, cell, false);
2526 }
2214 2527
2215 if (is_writethrough_io(cache, bio, lookup_result.cblock)) 2528 } else {
2216 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 2529 inc_hit_counter(cache, bio);
2217 else
2218 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2219 2530
2220 cell_defer(cache, cell, false); 2531 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
2532 !is_dirty(cache, lookup_result.cblock))
2533 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2534 else
2535 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2536
2537 cell_defer(cache, cell, false);
2538 }
2221 break; 2539 break;
2222 2540
2223 case POLICY_MISS: 2541 case POLICY_MISS:
@@ -2242,10 +2560,10 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2242 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, 2560 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2243 (unsigned) lookup_result.op); 2561 (unsigned) lookup_result.op);
2244 bio_io_error(bio); 2562 bio_io_error(bio);
2245 return DM_MAPIO_SUBMITTED; 2563 r = DM_MAPIO_SUBMITTED;
2246 } 2564 }
2247 2565
2248 return DM_MAPIO_REMAPPED; 2566 return r;
2249} 2567}
2250 2568
2251static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 2569static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
@@ -2406,26 +2724,71 @@ static int load_discard(void *context, sector_t discard_block_size,
2406 return 0; 2724 return 0;
2407} 2725}
2408 2726
2727static dm_cblock_t get_cache_dev_size(struct cache *cache)
2728{
2729 sector_t size = get_dev_size(cache->cache_dev);
2730 (void) sector_div(size, cache->sectors_per_block);
2731 return to_cblock(size);
2732}
2733
2734static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2735{
2736 if (from_cblock(new_size) > from_cblock(cache->cache_size))
2737 return true;
2738
2739 /*
2740 * We can't drop a dirty block when shrinking the cache.
2741 */
2742 while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
2743 new_size = to_cblock(from_cblock(new_size) + 1);
2744 if (is_dirty(cache, new_size)) {
2745 DMERR("unable to shrink cache; cache block %llu is dirty",
2746 (unsigned long long) from_cblock(new_size));
2747 return false;
2748 }
2749 }
2750
2751 return true;
2752}
2753
2754static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2755{
2756 int r;
2757
2758 r = dm_cache_resize(cache->cmd, cache->cache_size);
2759 if (r) {
2760 DMERR("could not resize cache metadata");
2761 return r;
2762 }
2763
2764 cache->cache_size = new_size;
2765
2766 return 0;
2767}
2768
2409static int cache_preresume(struct dm_target *ti) 2769static int cache_preresume(struct dm_target *ti)
2410{ 2770{
2411 int r = 0; 2771 int r = 0;
2412 struct cache *cache = ti->private; 2772 struct cache *cache = ti->private;
2413 sector_t actual_cache_size = get_dev_size(cache->cache_dev); 2773 dm_cblock_t csize = get_cache_dev_size(cache);
2414 (void) sector_div(actual_cache_size, cache->sectors_per_block);
2415 2774
2416 /* 2775 /*
2417 * Check to see if the cache has resized. 2776 * Check to see if the cache has resized.
2418 */ 2777 */
2419 if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) { 2778 if (!cache->sized) {
2420 cache->cache_size = to_cblock(actual_cache_size); 2779 r = resize_cache_dev(cache, csize);
2421 2780 if (r)
2422 r = dm_cache_resize(cache->cmd, cache->cache_size);
2423 if (r) {
2424 DMERR("could not resize cache metadata");
2425 return r; 2781 return r;
2426 }
2427 2782
2428 cache->sized = true; 2783 cache->sized = true;
2784
2785 } else if (csize != cache->cache_size) {
2786 if (!can_resize(cache, csize))
2787 return -EINVAL;
2788
2789 r = resize_cache_dev(cache, csize);
2790 if (r)
2791 return r;
2429 } 2792 }
2430 2793
2431 if (!cache->loaded_mappings) { 2794 if (!cache->loaded_mappings) {
@@ -2518,10 +2881,19 @@ static void cache_status(struct dm_target *ti, status_type_t type,
2518 (unsigned long long) from_cblock(residency), 2881 (unsigned long long) from_cblock(residency),
2519 cache->nr_dirty); 2882 cache->nr_dirty);
2520 2883
2521 if (cache->features.write_through) 2884 if (writethrough_mode(&cache->features))
2522 DMEMIT("1 writethrough "); 2885 DMEMIT("1 writethrough ");
2523 else 2886
2524 DMEMIT("0 "); 2887 else if (passthrough_mode(&cache->features))
2888 DMEMIT("1 passthrough ");
2889
2890 else if (writeback_mode(&cache->features))
2891 DMEMIT("1 writeback ");
2892
2893 else {
2894 DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
2895 goto err;
2896 }
2525 2897
2526 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 2898 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2527 if (sz < maxlen) { 2899 if (sz < maxlen) {
@@ -2553,7 +2925,128 @@ err:
2553} 2925}
2554 2926
2555/* 2927/*
2556 * Supports <key> <value>. 2928 * A cache block range can take two forms:
2929 *
2930 * i) A single cblock, eg. '3456'
2931 * ii) A begin and end cblock with a dash between, eg. 123-234
2932 */
2933static int parse_cblock_range(struct cache *cache, const char *str,
2934 struct cblock_range *result)
2935{
2936 char dummy;
2937 uint64_t b, e;
2938 int r;
2939
2940 /*
2941 * Try and parse form (ii) first.
2942 */
2943 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
2944 if (r < 0)
2945 return r;
2946
2947 if (r == 2) {
2948 result->begin = to_cblock(b);
2949 result->end = to_cblock(e);
2950 return 0;
2951 }
2952
2953 /*
2954 * That didn't work, try form (i).
2955 */
2956 r = sscanf(str, "%llu%c", &b, &dummy);
2957 if (r < 0)
2958 return r;
2959
2960 if (r == 1) {
2961 result->begin = to_cblock(b);
2962 result->end = to_cblock(from_cblock(result->begin) + 1u);
2963 return 0;
2964 }
2965
2966 DMERR("invalid cblock range '%s'", str);
2967 return -EINVAL;
2968}
2969
2970static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
2971{
2972 uint64_t b = from_cblock(range->begin);
2973 uint64_t e = from_cblock(range->end);
2974 uint64_t n = from_cblock(cache->cache_size);
2975
2976 if (b >= n) {
2977 DMERR("begin cblock out of range: %llu >= %llu", b, n);
2978 return -EINVAL;
2979 }
2980
2981 if (e > n) {
2982 DMERR("end cblock out of range: %llu > %llu", e, n);
2983 return -EINVAL;
2984 }
2985
2986 if (b >= e) {
2987 DMERR("invalid cblock range: %llu >= %llu", b, e);
2988 return -EINVAL;
2989 }
2990
2991 return 0;
2992}
2993
2994static int request_invalidation(struct cache *cache, struct cblock_range *range)
2995{
2996 struct invalidation_request req;
2997
2998 INIT_LIST_HEAD(&req.list);
2999 req.cblocks = range;
3000 atomic_set(&req.complete, 0);
3001 req.err = 0;
3002 init_waitqueue_head(&req.result_wait);
3003
3004 spin_lock(&cache->invalidation_lock);
3005 list_add(&req.list, &cache->invalidation_requests);
3006 spin_unlock(&cache->invalidation_lock);
3007 wake_worker(cache);
3008
3009 wait_event(req.result_wait, atomic_read(&req.complete));
3010 return req.err;
3011}
3012
3013static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3014 const char **cblock_ranges)
3015{
3016 int r = 0;
3017 unsigned i;
3018 struct cblock_range range;
3019
3020 if (!passthrough_mode(&cache->features)) {
3021 DMERR("cache has to be in passthrough mode for invalidation");
3022 return -EPERM;
3023 }
3024
3025 for (i = 0; i < count; i++) {
3026 r = parse_cblock_range(cache, cblock_ranges[i], &range);
3027 if (r)
3028 break;
3029
3030 r = validate_cblock_range(cache, &range);
3031 if (r)
3032 break;
3033
3034 /*
3035 * Pass the begin and end cache blocks to the worker and wake it.
3036 */
3037 r = request_invalidation(cache, &range);
3038 if (r)
3039 break;
3040 }
3041
3042 return r;
3043}
3044
3045/*
3046 * Supports
3047 * "<key> <value>"
3048 * and
3049 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
2557 * 3050 *
2558 * The key migration_threshold is supported by the cache target core. 3051 * The key migration_threshold is supported by the cache target core.
2559 */ 3052 */
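The two accepted range forms can be exercised outside the kernel as well; the sketch below mirrors the sscanf() logic above with plain uint64_t in place of dm_cblock_t:

#include <inttypes.h>
#include <stdio.h>

struct cblock_range_model { uint64_t begin; uint64_t end; };     /* [begin, end) */

/* Accepts "123" or "123-234"; rejects trailing garbage, like the code above. */
static int parse_range(const char *str, struct cblock_range_model *r)
{
        char dummy;
        uint64_t b, e;

        if (sscanf(str, "%" SCNu64 "-%" SCNu64 "%c", &b, &e, &dummy) == 2) {
                r->begin = b;
                r->end = e;
                return 0;
        }
        if (sscanf(str, "%" SCNu64 "%c", &b, &dummy) == 1) {
                r->begin = b;
                r->end = b + 1;
                return 0;
        }
        return -1;
}

int main(void)
{
        const char *tests[] = { "3456", "123-234", "bogus" };
        struct cblock_range_model r;

        for (unsigned i = 0; i < 3; i++) {
                if (parse_range(tests[i], &r))
                        printf("'%s': invalid\n", tests[i]);
                else
                        printf("'%s': [%" PRIu64 ", %" PRIu64 ")\n",
                               tests[i], r.begin, r.end);
        }
        return 0;
}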
@@ -2561,6 +3054,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2561{ 3054{
2562 struct cache *cache = ti->private; 3055 struct cache *cache = ti->private;
2563 3056
3057 if (!argc)
3058 return -EINVAL;
3059
3060 if (!strcasecmp(argv[0], "invalidate_cblocks"))
3061 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3062
2564 if (argc != 2) 3063 if (argc != 2)
2565 return -EINVAL; 3064 return -EINVAL;
2566 3065
@@ -2630,7 +3129,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2630 3129
2631static struct target_type cache_target = { 3130static struct target_type cache_target = {
2632 .name = "cache", 3131 .name = "cache",
2633 .version = {1, 1, 1}, 3132 .version = {1, 2, 0},
2634 .module = THIS_MODULE, 3133 .module = THIS_MODULE,
2635 .ctr = cache_ctr, 3134 .ctr = cache_ctr,
2636 .dtr = cache_dtr, 3135 .dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0fce0bc1a957..50ea7ed24dce 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2,6 +2,7 @@
2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de> 2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> 3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
4 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. 4 * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
5 * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
5 * 6 *
6 * This file is released under the GPL. 7 * This file is released under the GPL.
7 */ 8 */
@@ -98,6 +99,13 @@ struct iv_lmk_private {
98 u8 *seed; 99 u8 *seed;
99}; 100};
100 101
102#define TCW_WHITENING_SIZE 16
103struct iv_tcw_private {
104 struct crypto_shash *crc32_tfm;
105 u8 *iv_seed;
106 u8 *whitening;
107};
108
101/* 109/*
102 * Crypt: maps a linear range of a block device 110 * Crypt: maps a linear range of a block device
103 * and encrypts / decrypts at the same time. 111 * and encrypts / decrypts at the same time.
@@ -139,6 +147,7 @@ struct crypt_config {
139 struct iv_essiv_private essiv; 147 struct iv_essiv_private essiv;
140 struct iv_benbi_private benbi; 148 struct iv_benbi_private benbi;
141 struct iv_lmk_private lmk; 149 struct iv_lmk_private lmk;
150 struct iv_tcw_private tcw;
142 } iv_gen_private; 151 } iv_gen_private;
143 sector_t iv_offset; 152 sector_t iv_offset;
144 unsigned int iv_size; 153 unsigned int iv_size;
@@ -171,7 +180,8 @@ struct crypt_config {
171 180
172 unsigned long flags; 181 unsigned long flags;
173 unsigned int key_size; 182 unsigned int key_size;
174 unsigned int key_parts; 183 unsigned int key_parts; /* independent parts in key buffer */
184 unsigned int key_extra_size; /* additional keys length */
175 u8 key[0]; 185 u8 key[0];
176}; 186};
177 187
@@ -230,6 +240,16 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
230 * version 3: the same as version 2 with additional IV seed 240 * version 3: the same as version 2 with additional IV seed
231 * (it uses 65 keys, last key is used as IV seed) 241 * (it uses 65 keys, last key is used as IV seed)
232 * 242 *
243 * tcw: Compatible implementation of the block chaining mode used
244 * by the TrueCrypt device encryption system (prior to version 4.1).
245 * For more info see: http://www.truecrypt.org
246 * It operates on full 512 byte sectors and uses CBC
247 * with an IV derived from initial key and the sector number.
248 * In addition, a whitening value is applied to every sector; the whitening
249 * is calculated from the initial key and sector number, mixed using CRC32.
250 * Note that this encryption scheme is vulnerable to watermarking attacks
251 * and should only be used to access old compatible containers.
252 *
233 * plumb: unimplemented, see: 253 * plumb: unimplemented, see:
234 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 254 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
235 */ 255 */
@@ -530,7 +550,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
530 char ctx[crypto_shash_descsize(lmk->hash_tfm)]; 550 char ctx[crypto_shash_descsize(lmk->hash_tfm)];
531 } sdesc; 551 } sdesc;
532 struct md5_state md5state; 552 struct md5_state md5state;
533 u32 buf[4]; 553 __le32 buf[4];
534 int i, r; 554 int i, r;
535 555
536 sdesc.desc.tfm = lmk->hash_tfm; 556 sdesc.desc.tfm = lmk->hash_tfm;
@@ -608,6 +628,153 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
608 return r; 628 return r;
609} 629}
610 630
631static void crypt_iv_tcw_dtr(struct crypt_config *cc)
632{
633 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
634
635 kzfree(tcw->iv_seed);
636 tcw->iv_seed = NULL;
637 kzfree(tcw->whitening);
638 tcw->whitening = NULL;
639
640 if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
641 crypto_free_shash(tcw->crc32_tfm);
642 tcw->crc32_tfm = NULL;
643}
644
645static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
646 const char *opts)
647{
648 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
649
650 if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
651 ti->error = "Wrong key size for TCW";
652 return -EINVAL;
653 }
654
655 tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0);
656 if (IS_ERR(tcw->crc32_tfm)) {
657 ti->error = "Error initializing CRC32 in TCW";
658 return PTR_ERR(tcw->crc32_tfm);
659 }
660
661 tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL);
662 tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL);
663 if (!tcw->iv_seed || !tcw->whitening) {
664 crypt_iv_tcw_dtr(cc);
665 ti->error = "Error allocating seed storage in TCW";
666 return -ENOMEM;
667 }
668
669 return 0;
670}
671
672static int crypt_iv_tcw_init(struct crypt_config *cc)
673{
674 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
675 int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE;
676
677 memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size);
678 memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size],
679 TCW_WHITENING_SIZE);
680
681 return 0;
682}
683
684static int crypt_iv_tcw_wipe(struct crypt_config *cc)
685{
686 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
687
688 memset(tcw->iv_seed, 0, cc->iv_size);
689 memset(tcw->whitening, 0, TCW_WHITENING_SIZE);
690
691 return 0;
692}
693
694static int crypt_iv_tcw_whitening(struct crypt_config *cc,
695 struct dm_crypt_request *dmreq,
696 u8 *data)
697{
698 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
699 u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
700 u8 buf[TCW_WHITENING_SIZE];
701 struct {
702 struct shash_desc desc;
703 char ctx[crypto_shash_descsize(tcw->crc32_tfm)];
704 } sdesc;
705 int i, r;
706
707 /* xor whitening with sector number */
708 memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
709 crypto_xor(buf, (u8 *)&sector, 8);
710 crypto_xor(&buf[8], (u8 *)&sector, 8);
711
712 /* calculate crc32 for every 32bit part and xor it */
713 sdesc.desc.tfm = tcw->crc32_tfm;
714 sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
715 for (i = 0; i < 4; i++) {
716 r = crypto_shash_init(&sdesc.desc);
717 if (r)
718 goto out;
719 r = crypto_shash_update(&sdesc.desc, &buf[i * 4], 4);
720 if (r)
721 goto out;
722 r = crypto_shash_final(&sdesc.desc, &buf[i * 4]);
723 if (r)
724 goto out;
725 }
726 crypto_xor(&buf[0], &buf[12], 4);
727 crypto_xor(&buf[4], &buf[8], 4);
728
729 /* apply whitening (8 bytes) to whole sector */
730 for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
731 crypto_xor(data + i * 8, buf, 8);
732out:
733 memset(buf, 0, sizeof(buf));
734 return r;
735}
736
737static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
738 struct dm_crypt_request *dmreq)
739{
740 struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
741 u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
742 u8 *src;
743 int r = 0;
744
745 /* Remove whitening from ciphertext */
746 if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
747 src = kmap_atomic(sg_page(&dmreq->sg_in));
748 r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset);
749 kunmap_atomic(src);
750 }
751
752 /* Calculate IV */
753 memcpy(iv, tcw->iv_seed, cc->iv_size);
754 crypto_xor(iv, (u8 *)&sector, 8);
755 if (cc->iv_size > 8)
756 crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
757
758 return r;
759}
760
761static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
762 struct dm_crypt_request *dmreq)
763{
764 u8 *dst;
765 int r;
766
767 if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
768 return 0;
769
770 /* Apply whitening on ciphertext */
771 dst = kmap_atomic(sg_page(&dmreq->sg_out));
772 r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset);
773 kunmap_atomic(dst);
774
775 return r;
776}
777
611static struct crypt_iv_operations crypt_iv_plain_ops = { 778static struct crypt_iv_operations crypt_iv_plain_ops = {
612 .generator = crypt_iv_plain_gen 779 .generator = crypt_iv_plain_gen
613}; 780};
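Stripped of the crypto API plumbing, the tcw IV is simply the key-derived seed xored with the little-endian sector number (the CRC32-based whitening is a separate step and is left out here). A userspace sketch of just the IV derivation, assuming IVs of at most 16 bytes:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* iv = iv_seed XOR little-endian(sector); the sector bytes are reused for
 * any IV bytes beyond the first eight. */
static void tcw_iv_model(uint8_t *iv, const uint8_t *iv_seed,
                         unsigned iv_size, uint64_t sector)
{
        uint8_t sec[8];

        for (unsigned i = 0; i < 8; i++)
                sec[i] = (uint8_t)(sector >> (8 * i));  /* cpu_to_le64() equivalent */

        memcpy(iv, iv_seed, iv_size);
        for (unsigned i = 0; i < iv_size && i < 8; i++)
                iv[i] ^= sec[i];
        for (unsigned i = 8; i < iv_size; i++)
                iv[i] ^= sec[i - 8];
}

int main(void)
{
        uint8_t seed[16] = { 0 };       /* stand-in for the key-derived iv_seed */
        uint8_t iv[16];

        tcw_iv_model(iv, seed, sizeof(iv), 0x0102030405060708ULL);
        for (unsigned i = 0; i < sizeof(iv); i++)
                printf("%02x", iv[i]);
        printf("\n");   /* 08070605040302010807060504030201 with an all-zero seed */
        return 0;
}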
@@ -643,6 +810,15 @@ static struct crypt_iv_operations crypt_iv_lmk_ops = {
643 .post = crypt_iv_lmk_post 810 .post = crypt_iv_lmk_post
644}; 811};
645 812
813static struct crypt_iv_operations crypt_iv_tcw_ops = {
814 .ctr = crypt_iv_tcw_ctr,
815 .dtr = crypt_iv_tcw_dtr,
816 .init = crypt_iv_tcw_init,
817 .wipe = crypt_iv_tcw_wipe,
818 .generator = crypt_iv_tcw_gen,
819 .post = crypt_iv_tcw_post
820};
821
646static void crypt_convert_init(struct crypt_config *cc, 822static void crypt_convert_init(struct crypt_config *cc,
647 struct convert_context *ctx, 823 struct convert_context *ctx,
648 struct bio *bio_out, struct bio *bio_in, 824 struct bio *bio_out, struct bio *bio_in,
@@ -1274,9 +1450,12 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
1274 1450
1275static int crypt_setkey_allcpus(struct crypt_config *cc) 1451static int crypt_setkey_allcpus(struct crypt_config *cc)
1276{ 1452{
1277 unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); 1453 unsigned subkey_size;
1278 int err = 0, i, r; 1454 int err = 0, i, r;
1279 1455
1456 /* Ignore extra keys (which are used for IV etc) */
1457 subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
1458
1280 for (i = 0; i < cc->tfms_count; i++) { 1459 for (i = 0; i < cc->tfms_count; i++) {
1281 r = crypto_ablkcipher_setkey(cc->tfms[i], 1460 r = crypto_ablkcipher_setkey(cc->tfms[i],
1282 cc->key + (i * subkey_size), 1461 cc->key + (i * subkey_size),
@@ -1409,6 +1588,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1409 return -EINVAL; 1588 return -EINVAL;
1410 } 1589 }
1411 cc->key_parts = cc->tfms_count; 1590 cc->key_parts = cc->tfms_count;
1591 cc->key_extra_size = 0;
1412 1592
1413 cc->cipher = kstrdup(cipher, GFP_KERNEL); 1593 cc->cipher = kstrdup(cipher, GFP_KERNEL);
1414 if (!cc->cipher) 1594 if (!cc->cipher)
@@ -1460,13 +1640,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1460 goto bad; 1640 goto bad;
1461 } 1641 }
1462 1642
1463 /* Initialize and set key */
1464 ret = crypt_set_key(cc, key);
1465 if (ret < 0) {
1466 ti->error = "Error decoding and setting key";
1467 goto bad;
1468 }
1469
1470 /* Initialize IV */ 1643 /* Initialize IV */
1471 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); 1644 cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
1472 if (cc->iv_size) 1645 if (cc->iv_size)
@@ -1493,18 +1666,33 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1493 cc->iv_gen_ops = &crypt_iv_null_ops; 1666 cc->iv_gen_ops = &crypt_iv_null_ops;
1494 else if (strcmp(ivmode, "lmk") == 0) { 1667 else if (strcmp(ivmode, "lmk") == 0) {
1495 cc->iv_gen_ops = &crypt_iv_lmk_ops; 1668 cc->iv_gen_ops = &crypt_iv_lmk_ops;
1496 /* Version 2 and 3 is recognised according 1669 /*
1670 * Version 2 and 3 is recognised according
1497 * to length of provided multi-key string. 1671 * to length of provided multi-key string.
1498 * If present (version 3), last key is used as IV seed. 1672 * If present (version 3), last key is used as IV seed.
1673 * All keys (including IV seed) are always the same size.
1499 */ 1674 */
1500 if (cc->key_size % cc->key_parts) 1675 if (cc->key_size % cc->key_parts) {
1501 cc->key_parts++; 1676 cc->key_parts++;
1677 cc->key_extra_size = cc->key_size / cc->key_parts;
1678 }
1679 } else if (strcmp(ivmode, "tcw") == 0) {
1680 cc->iv_gen_ops = &crypt_iv_tcw_ops;
1681 cc->key_parts += 2; /* IV + whitening */
1682 cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
1502 } else { 1683 } else {
1503 ret = -EINVAL; 1684 ret = -EINVAL;
1504 ti->error = "Invalid IV mode"; 1685 ti->error = "Invalid IV mode";
1505 goto bad; 1686 goto bad;
1506 } 1687 }
1507 1688
1689 /* Initialize and set key */
1690 ret = crypt_set_key(cc, key);
1691 if (ret < 0) {
1692 ti->error = "Error decoding and setting key";
1693 goto bad;
1694 }
1695
1508 /* Allocate IV */ 1696 /* Allocate IV */
1509 if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { 1697 if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) {
1510 ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); 1698 ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
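A worked example of the resulting key layout for the tcw case, assuming aes-cbc with a 256-bit cipher key and a single tfm (the sizes are illustrative only): the table line supplies 64 key bytes, the last 32 of which are the IV seed and whitening rather than cipher key material.

#include <stdio.h>

int main(void)
{
        unsigned key_size = 64;         /* total key bytes supplied to dm-crypt */
        unsigned tfms_count = 1;        /* single cipher instance */
        unsigned iv_size = 16;          /* AES block size */
        unsigned whitening_size = 16;   /* TCW_WHITENING_SIZE */

        unsigned key_extra_size = iv_size + whitening_size;
        unsigned subkey_size = (key_size - key_extra_size) / tfms_count;

        printf("cipher key per tfm: %u bytes\n", subkey_size);          /* 32 */
        printf("iv seed offset:     %u\n", key_size - key_extra_size);  /* 32 */
        printf("whitening offset:   %u\n", key_size - whitening_size);  /* 48 */
        return 0;
}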
@@ -1817,7 +2005,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
1817 2005
1818static struct target_type crypt_target = { 2006static struct target_type crypt_target = {
1819 .name = "crypt", 2007 .name = "crypt",
1820 .version = {1, 12, 1}, 2008 .version = {1, 13, 0},
1821 .module = THIS_MODULE, 2009 .module = THIS_MODULE,
1822 .ctr = crypt_ctr, 2010 .ctr = crypt_ctr,
1823 .dtr = crypt_dtr, 2011 .dtr = crypt_dtr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index afe08146f73e..51521429fb59 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -57,7 +57,7 @@ struct vers_iter {
57static struct list_head _name_buckets[NUM_BUCKETS]; 57static struct list_head _name_buckets[NUM_BUCKETS];
58static struct list_head _uuid_buckets[NUM_BUCKETS]; 58static struct list_head _uuid_buckets[NUM_BUCKETS];
59 59
60static void dm_hash_remove_all(int keep_open_devices); 60static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred);
61 61
62/* 62/*
63 * Guards access to both hash tables. 63 * Guards access to both hash tables.
@@ -86,7 +86,7 @@ static int dm_hash_init(void)
86 86
87static void dm_hash_exit(void) 87static void dm_hash_exit(void)
88{ 88{
89 dm_hash_remove_all(0); 89 dm_hash_remove_all(false, false, false);
90} 90}
91 91
92/*----------------------------------------------------------------- 92/*-----------------------------------------------------------------
@@ -276,7 +276,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc)
276 return table; 276 return table;
277} 277}
278 278
279static void dm_hash_remove_all(int keep_open_devices) 279static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred)
280{ 280{
281 int i, dev_skipped; 281 int i, dev_skipped;
282 struct hash_cell *hc; 282 struct hash_cell *hc;
@@ -293,7 +293,8 @@ retry:
293 md = hc->md; 293 md = hc->md;
294 dm_get(md); 294 dm_get(md);
295 295
296 if (keep_open_devices && dm_lock_for_deletion(md)) { 296 if (keep_open_devices &&
297 dm_lock_for_deletion(md, mark_deferred, only_deferred)) {
297 dm_put(md); 298 dm_put(md);
298 dev_skipped++; 299 dev_skipped++;
299 continue; 300 continue;
@@ -450,6 +451,11 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
450 return md; 451 return md;
451} 452}
452 453
454void dm_deferred_remove(void)
455{
456 dm_hash_remove_all(true, false, true);
457}
458
453/*----------------------------------------------------------------- 459/*-----------------------------------------------------------------
454 * Implementation of the ioctl commands 460 * Implementation of the ioctl commands
455 *---------------------------------------------------------------*/ 461 *---------------------------------------------------------------*/
@@ -461,7 +467,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
461 467
462static int remove_all(struct dm_ioctl *param, size_t param_size) 468static int remove_all(struct dm_ioctl *param, size_t param_size)
463{ 469{
464 dm_hash_remove_all(1); 470 dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false);
465 param->data_size = 0; 471 param->data_size = 0;
466 return 0; 472 return 0;
467} 473}
@@ -683,6 +689,9 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
683 if (dm_suspended_md(md)) 689 if (dm_suspended_md(md))
684 param->flags |= DM_SUSPEND_FLAG; 690 param->flags |= DM_SUSPEND_FLAG;
685 691
692 if (dm_test_deferred_remove_flag(md))
693 param->flags |= DM_DEFERRED_REMOVE;
694
686 param->dev = huge_encode_dev(disk_devt(disk)); 695 param->dev = huge_encode_dev(disk_devt(disk));
687 696
688 /* 697 /*
@@ -832,8 +841,13 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
832 /* 841 /*
833 * Ensure the device is not open and nothing further can open it. 842 * Ensure the device is not open and nothing further can open it.
834 */ 843 */
835 r = dm_lock_for_deletion(md); 844 r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false);
836 if (r) { 845 if (r) {
846 if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) {
847 up_write(&_hash_lock);
848 dm_put(md);
849 return 0;
850 }
837 DMDEBUG_LIMIT("unable to remove open device %s", hc->name); 851 DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
838 up_write(&_hash_lock); 852 up_write(&_hash_lock);
839 dm_put(md); 853 dm_put(md);
@@ -848,6 +862,8 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
848 dm_table_destroy(t); 862 dm_table_destroy(t);
849 } 863 }
850 864
865 param->flags &= ~DM_DEFERRED_REMOVE;
866
851 if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) 867 if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
852 param->flags |= DM_UEVENT_GENERATED_FLAG; 868 param->flags |= DM_UEVENT_GENERATED_FLAG;
853 869
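The deferred-remove handling in dev_remove() reduces to three outcomes; the sketch below is a simplified userspace model of that decision (local enum and function, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

enum remove_result { REMOVED, DEFERRED, BUSY };

static enum remove_result try_remove(bool is_open, bool deferred_flag)
{
        if (!is_open)
                return REMOVED;         /* nothing holds the device open */
        if (deferred_flag)
                return DEFERRED;        /* marked; removed on last close */
        return BUSY;                    /* -EBUSY reported back to userspace */
}

int main(void)
{
        printf("%d %d %d\n",
               try_remove(false, false),        /* REMOVED  */
               try_remove(true, true),          /* DEFERRED */
               try_remove(true, false));        /* BUSY     */
        return 0;
}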
@@ -1469,6 +1485,14 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
1469 if (**argv != '@') 1485 if (**argv != '@')
1470 return 2; /* no '@' prefix, deliver to target */ 1486 return 2; /* no '@' prefix, deliver to target */
1471 1487
1488 if (!strcasecmp(argv[0], "@cancel_deferred_remove")) {
1489 if (argc != 1) {
1490 DMERR("Invalid arguments for @cancel_deferred_remove");
1491 return -EINVAL;
1492 }
1493 return dm_cancel_deferred_remove(md);
1494 }
1495
1472 r = dm_stats_message(md, argc, argv, result, maxlen); 1496 r = dm_stats_message(md, argc, argv, result, maxlen);
1473 if (r < 2) 1497 if (r < 2)
1474 return r; 1498 return r;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index de570a558764..6eb9dc9ef8f3 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -87,6 +87,7 @@ struct multipath {
87 unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ 87 unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
88 unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ 88 unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
89 unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ 89 unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
90 unsigned pg_init_disabled:1; /* pg_init is not currently allowed */
90 91
91 unsigned pg_init_retries; /* Number of times to retry pg_init */ 92 unsigned pg_init_retries; /* Number of times to retry pg_init */
92 unsigned pg_init_count; /* Number of times pg_init called */ 93 unsigned pg_init_count; /* Number of times pg_init called */
@@ -390,13 +391,16 @@ static int map_io(struct multipath *m, struct request *clone,
390 if (was_queued) 391 if (was_queued)
391 m->queue_size--; 392 m->queue_size--;
392 393
393 if ((pgpath && m->queue_io) || 394 if (m->pg_init_required) {
394 (!pgpath && m->queue_if_no_path)) { 395 if (!m->pg_init_in_progress)
396 queue_work(kmultipathd, &m->process_queued_ios);
397 r = DM_MAPIO_REQUEUE;
398 } else if ((pgpath && m->queue_io) ||
399 (!pgpath && m->queue_if_no_path)) {
395 /* Queue for the daemon to resubmit */ 400 /* Queue for the daemon to resubmit */
396 list_add_tail(&clone->queuelist, &m->queued_ios); 401 list_add_tail(&clone->queuelist, &m->queued_ios);
397 m->queue_size++; 402 m->queue_size++;
398 if ((m->pg_init_required && !m->pg_init_in_progress) || 403 if (!m->queue_io)
399 !m->queue_io)
400 queue_work(kmultipathd, &m->process_queued_ios); 404 queue_work(kmultipathd, &m->process_queued_ios);
401 pgpath = NULL; 405 pgpath = NULL;
402 r = DM_MAPIO_SUBMITTED; 406 r = DM_MAPIO_SUBMITTED;
@@ -497,7 +501,8 @@ static void process_queued_ios(struct work_struct *work)
497 (!pgpath && !m->queue_if_no_path)) 501 (!pgpath && !m->queue_if_no_path))
498 must_queue = 0; 502 must_queue = 0;
499 503
500 if (m->pg_init_required && !m->pg_init_in_progress && pgpath) 504 if (m->pg_init_required && !m->pg_init_in_progress && pgpath &&
505 !m->pg_init_disabled)
501 __pg_init_all_paths(m); 506 __pg_init_all_paths(m);
502 507
503 spin_unlock_irqrestore(&m->lock, flags); 508 spin_unlock_irqrestore(&m->lock, flags);
@@ -942,10 +947,20 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
942 947
943static void flush_multipath_work(struct multipath *m) 948static void flush_multipath_work(struct multipath *m)
944{ 949{
950 unsigned long flags;
951
952 spin_lock_irqsave(&m->lock, flags);
953 m->pg_init_disabled = 1;
954 spin_unlock_irqrestore(&m->lock, flags);
955
945 flush_workqueue(kmpath_handlerd); 956 flush_workqueue(kmpath_handlerd);
946 multipath_wait_for_pg_init_completion(m); 957 multipath_wait_for_pg_init_completion(m);
947 flush_workqueue(kmultipathd); 958 flush_workqueue(kmultipathd);
948 flush_work(&m->trigger_event); 959 flush_work(&m->trigger_event);
960
961 spin_lock_irqsave(&m->lock, flags);
962 m->pg_init_disabled = 0;
963 spin_unlock_irqrestore(&m->lock, flags);
949} 964}
950 965
951static void multipath_dtr(struct dm_target *ti) 966static void multipath_dtr(struct dm_target *ti)
@@ -1164,7 +1179,7 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1164 1179
1165 spin_lock_irqsave(&m->lock, flags); 1180 spin_lock_irqsave(&m->lock, flags);
1166 1181
1167 if (m->pg_init_count <= m->pg_init_retries) 1182 if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
1168 m->pg_init_required = 1; 1183 m->pg_init_required = 1;
1169 else 1184 else
1170 limit_reached = 1; 1185 limit_reached = 1;
@@ -1665,6 +1680,11 @@ static int multipath_busy(struct dm_target *ti)
1665 1680
1666 spin_lock_irqsave(&m->lock, flags); 1681 spin_lock_irqsave(&m->lock, flags);
1667 1682
1683 /* pg_init in progress, requeue until done */
1684 if (m->pg_init_in_progress) {
1685 busy = 1;
1686 goto out;
1687 }
1668 /* Guess which priority_group will be used at next mapping time */ 1688 /* Guess which priority_group will be used at next mapping time */
1669 if (unlikely(!m->current_pgpath && m->next_pg)) 1689 if (unlikely(!m->current_pgpath && m->next_pg))
1670 pg = m->next_pg; 1690 pg = m->next_pg;
@@ -1714,7 +1734,7 @@ out:
1714 *---------------------------------------------------------------*/ 1734 *---------------------------------------------------------------*/
1715static struct target_type multipath_target = { 1735static struct target_type multipath_target = {
1716 .name = "multipath", 1736 .name = "multipath",
1717 .version = {1, 5, 1}, 1737 .version = {1, 6, 0},
1718 .module = THIS_MODULE, 1738 .module = THIS_MODULE,
1719 .ctr = multipath_ctr, 1739 .ctr = multipath_ctr,
1720 .dtr = multipath_dtr, 1740 .dtr = multipath_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8f8783533ac7..465f08ca62b1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -545,14 +545,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti)
545 545
546/* 546/*
547 * Used to dynamically allocate the arg array. 547 * Used to dynamically allocate the arg array.
548 *
549 * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
550 * process messages even if some device is suspended. These messages have a
551 * small fixed number of arguments.
552 *
553 * On the other hand, dm-switch needs to process bulk data using messages and
554 * excessive use of GFP_NOIO could cause trouble.
548 */ 555 */
549static char **realloc_argv(unsigned *array_size, char **old_argv) 556static char **realloc_argv(unsigned *array_size, char **old_argv)
550{ 557{
551 char **argv; 558 char **argv;
552 unsigned new_size; 559 unsigned new_size;
560 gfp_t gfp;
553 561
554 new_size = *array_size ? *array_size * 2 : 64; 562 if (*array_size) {
555 argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); 563 new_size = *array_size * 2;
564 gfp = GFP_KERNEL;
565 } else {
566 new_size = 8;
567 gfp = GFP_NOIO;
568 }
569 argv = kmalloc(new_size * sizeof(*argv), gfp);
556 if (argv) { 570 if (argv) {
557 memcpy(argv, old_argv, *array_size * sizeof(*argv)); 571 memcpy(argv, old_argv, *array_size * sizeof(*argv));
558 *array_size = new_size; 572 *array_size = new_size;
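
Read together with the new comment, the allocation policy is: the very first, small argv array (8 slots) uses GFP_NOIO so that table messages with a handful of arguments can be parsed even while a device is suspended, and every later doubling uses GFP_KERNEL. A userspace sketch of that policy, with malloc() standing in for kmalloc() and the gfp choice reduced to comments:

#include <stdlib.h>
#include <string.h>

static char **realloc_argv_model(unsigned *array_size, char **old_argv)
{
	char **argv;
	unsigned new_size;

	if (*array_size)
		new_size = *array_size * 2;	/* growth: GFP_KERNEL */
	else
		new_size = 8;			/* first, small array: GFP_NOIO */

	argv = malloc(new_size * sizeof(*argv));
	if (argv) {
		if (old_argv)
			memcpy(argv, old_argv, *array_size * sizeof(*argv));
		*array_size = new_size;
	}

	free(old_argv);	/* release the old array (the model owns it) */
	return argv;
}
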
@@ -1548,8 +1562,11 @@ int dm_table_resume_targets(struct dm_table *t)
1548 continue; 1562 continue;
1549 1563
1550 r = ti->type->preresume(ti); 1564 r = ti->type->preresume(ti);
1551 if (r) 1565 if (r) {
1566 DMERR("%s: %s: preresume failed, error = %d",
1567 dm_device_name(t->md), ti->type->name, r);
1552 return r; 1568 return r;
1569 }
1553 } 1570 }
1554 1571
1555 for (i = 0; i < t->num_targets; i++) { 1572 for (i = 0; i < t->num_targets; i++) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b3e26c7d1417..0704c523a76b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -49,6 +49,11 @@ static unsigned int _major = 0;
49static DEFINE_IDR(_minor_idr); 49static DEFINE_IDR(_minor_idr);
50 50
51static DEFINE_SPINLOCK(_minor_lock); 51static DEFINE_SPINLOCK(_minor_lock);
52
53static void do_deferred_remove(struct work_struct *w);
54
55static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
56
52/* 57/*
53 * For bio-based dm. 58 * For bio-based dm.
54 * One of these is allocated per bio. 59 * One of these is allocated per bio.
@@ -116,6 +121,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
116#define DMF_DELETING 4 121#define DMF_DELETING 4
117#define DMF_NOFLUSH_SUSPENDING 5 122#define DMF_NOFLUSH_SUSPENDING 5
118#define DMF_MERGE_IS_OPTIONAL 6 123#define DMF_MERGE_IS_OPTIONAL 6
124#define DMF_DEFERRED_REMOVE 7
119 125
120/* 126/*
121 * A dummy definition to make RCU happy. 127 * A dummy definition to make RCU happy.
@@ -299,6 +305,8 @@ out_free_io_cache:
299 305
300static void local_exit(void) 306static void local_exit(void)
301{ 307{
308 flush_scheduled_work();
309
302 kmem_cache_destroy(_rq_tio_cache); 310 kmem_cache_destroy(_rq_tio_cache);
303 kmem_cache_destroy(_io_cache); 311 kmem_cache_destroy(_io_cache);
304 unregister_blkdev(_major, _name); 312 unregister_blkdev(_major, _name);
@@ -404,7 +412,10 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode)
404 412
405 spin_lock(&_minor_lock); 413 spin_lock(&_minor_lock);
406 414
407 atomic_dec(&md->open_count); 415 if (atomic_dec_and_test(&md->open_count) &&
416 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
417 schedule_work(&deferred_remove_work);
418
408 dm_put(md); 419 dm_put(md);
409 420
410 spin_unlock(&_minor_lock); 421 spin_unlock(&_minor_lock);
@@ -418,14 +429,18 @@ int dm_open_count(struct mapped_device *md)
418/* 429/*
419 * Guarantees nothing is using the device before it's deleted. 430 * Guarantees nothing is using the device before it's deleted.
420 */ 431 */
421int dm_lock_for_deletion(struct mapped_device *md) 432int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
422{ 433{
423 int r = 0; 434 int r = 0;
424 435
425 spin_lock(&_minor_lock); 436 spin_lock(&_minor_lock);
426 437
427 if (dm_open_count(md)) 438 if (dm_open_count(md)) {
428 r = -EBUSY; 439 r = -EBUSY;
440 if (mark_deferred)
441 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
442 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
443 r = -EEXIST;
429 else 444 else
430 set_bit(DMF_DELETING, &md->flags); 445 set_bit(DMF_DELETING, &md->flags);
431 446
@@ -434,6 +449,27 @@ int dm_lock_for_deletion(struct mapped_device *md)
434 return r; 449 return r;
435} 450}
436 451
452int dm_cancel_deferred_remove(struct mapped_device *md)
453{
454 int r = 0;
455
456 spin_lock(&_minor_lock);
457
458 if (test_bit(DMF_DELETING, &md->flags))
459 r = -EBUSY;
460 else
461 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
462
463 spin_unlock(&_minor_lock);
464
465 return r;
466}
467
468static void do_deferred_remove(struct work_struct *w)
469{
470 dm_deferred_remove();
471}
472
437sector_t dm_get_size(struct mapped_device *md) 473sector_t dm_get_size(struct mapped_device *md)
438{ 474{
439 return get_capacity(md->disk); 475 return get_capacity(md->disk);
@@ -2894,6 +2930,11 @@ int dm_suspended_md(struct mapped_device *md)
2894 return test_bit(DMF_SUSPENDED, &md->flags); 2930 return test_bit(DMF_SUSPENDED, &md->flags);
2895} 2931}
2896 2932
2933int dm_test_deferred_remove_flag(struct mapped_device *md)
2934{
2935 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2936}
2937
2897int dm_suspended(struct dm_target *ti) 2938int dm_suspended(struct dm_target *ti)
2898{ 2939{
2899 return dm_suspended_md(dm_table_get_md(ti->table)); 2940 return dm_suspended_md(dm_table_get_md(ti->table));
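
Taken together, the dm.c changes add a small deferred-remove state machine: a busy device can be marked DMF_DEFERRED_REMOVE instead of failing the removal outright, and the last dm_blk_close() then schedules do_deferred_remove() to finish the job. The sketch below is a userspace model only, with plain booleans and error numbers instead of md->flags bitops, _minor_lock and the workqueue; the names are invented for the model.

#include <stdbool.h>
#include <stdio.h>

struct md_model {
	int open_count;			/* md->open_count */
	bool deferred_remove;		/* DMF_DEFERRED_REMOVE */
	bool deleting;			/* DMF_DELETING */
};

/* mirrors dm_lock_for_deletion(md, mark_deferred, only_deferred) */
static int lock_for_deletion(struct md_model *md, bool mark_deferred, bool only_deferred)
{
	if (md->open_count) {
		if (mark_deferred)
			md->deferred_remove = true;
		return -16;	/* -EBUSY: still open, possibly marked for later */
	}
	if (only_deferred && !md->deferred_remove)
		return -17;	/* -EEXIST: nothing was deferred for this device */
	md->deleting = true;
	return 0;
}

/* mirrors the new last-close check in dm_blk_close() */
static bool last_close_schedules_remove(struct md_model *md)
{
	return --md->open_count == 0 && md->deferred_remove;
}

int main(void)
{
	struct md_model md = { .open_count = 1 };

	/* removing an open device: fails with -EBUSY but marks it deferred */
	printf("lock_for_deletion -> %d\n", lock_for_deletion(&md, true, false));

	/* last close: the deferred remove work would be scheduled now */
	printf("schedule deferred remove -> %d\n", last_close_schedules_remove(&md));
	return 0;
}
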
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1d1ad7b7e527..c57ba550f69e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -129,6 +129,16 @@ int dm_deleting_md(struct mapped_device *md);
129int dm_suspended_md(struct mapped_device *md); 129int dm_suspended_md(struct mapped_device *md);
130 130
131/* 131/*
132 * Test if the device is scheduled for deferred remove.
133 */
134int dm_test_deferred_remove_flag(struct mapped_device *md);
135
136/*
137 * Try to remove devices marked for deferred removal.
138 */
139void dm_deferred_remove(void);
140
141/*
132 * The device-mapper can be driven through one of two interfaces; 142 * The device-mapper can be driven through one of two interfaces;
133 * ioctl or filesystem, depending which patch you have applied. 143 * ioctl or filesystem, depending which patch you have applied.
134 */ 144 */
@@ -158,7 +168,8 @@ void dm_stripe_exit(void);
158void dm_destroy(struct mapped_device *md); 168void dm_destroy(struct mapped_device *md);
159void dm_destroy_immediate(struct mapped_device *md); 169void dm_destroy_immediate(struct mapped_device *md);
160int dm_open_count(struct mapped_device *md); 170int dm_open_count(struct mapped_device *md);
161int dm_lock_for_deletion(struct mapped_device *md); 171int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred);
172int dm_cancel_deferred_remove(struct mapped_device *md);
162int dm_request_based(struct mapped_device *md); 173int dm_request_based(struct mapped_device *md);
163sector_t dm_get_size(struct mapped_device *md); 174sector_t dm_get_size(struct mapped_device *md);
164struct dm_stats *dm_get_stats(struct mapped_device *md); 175struct dm_stats *dm_get_stats(struct mapped_device *md);
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 172147eb1d40..af96e24ec328 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -509,15 +509,18 @@ static int grow_add_tail_block(struct resize *resize)
509static int grow_needs_more_blocks(struct resize *resize) 509static int grow_needs_more_blocks(struct resize *resize)
510{ 510{
511 int r; 511 int r;
512 unsigned old_nr_blocks = resize->old_nr_full_blocks;
512 513
513 if (resize->old_nr_entries_in_last_block > 0) { 514 if (resize->old_nr_entries_in_last_block > 0) {
515 old_nr_blocks++;
516
514 r = grow_extend_tail_block(resize, resize->max_entries); 517 r = grow_extend_tail_block(resize, resize->max_entries);
515 if (r) 518 if (r)
516 return r; 519 return r;
517 } 520 }
518 521
519 r = insert_full_ablocks(resize->info, resize->size_of_block, 522 r = insert_full_ablocks(resize->info, resize->size_of_block,
520 resize->old_nr_full_blocks, 523 old_nr_blocks,
521 resize->new_nr_full_blocks, 524 resize->new_nr_full_blocks,
522 resize->max_entries, resize->value, 525 resize->max_entries, resize->value,
523 &resize->root); 526 &resize->root);
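
This dm-array hunk reads as an off-by-one fix in the grow path: once a partially filled old tail block has been extended to max_entries, it must be counted as a full block, otherwise insert_full_ablocks() would start appending at the index of the block that was just extended. A tiny standalone example of the corrected accounting (the numbers are invented for illustration):

#include <stdio.h>

/* An extended tail block counts as full, so appending starts one index later. */
static unsigned first_block_to_append(unsigned old_nr_full_blocks,
				      unsigned old_nr_entries_in_last_block)
{
	unsigned old_nr_blocks = old_nr_full_blocks;

	if (old_nr_entries_in_last_block > 0)
		old_nr_blocks++;	/* the tail was just grown to max_entries */

	return old_nr_blocks;		/* insert_full_ablocks() begins here */
}

int main(void)
{
	/* 2 full blocks plus a 60-entry tail: new full blocks start at index 3 */
	printf("%u\n", first_block_to_append(2, 60));
	return 0;
}
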
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index e735a6d5a793..cfbf9617e465 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -140,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
140 140
141static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) 141static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
142{ 142{
143 int r;
144 uint32_t old_count;
145 enum allocation_event ev; 143 enum allocation_event ev;
145 enum allocation_event ev; 143 enum allocation_event ev;
146 struct sm_disk *smd = container_of(sm, struct sm_disk, sm); 144 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
147 145
148 r = sm_ll_dec(&smd->ll, b, &ev); 146 return sm_ll_dec(&smd->ll, b, &ev);
149 if (!r && (ev == SM_FREE)) {
150 /*
151 * It's only free if it's also free in the last
152 * transaction.
153 */
154 r = sm_ll_lookup(&smd->old_ll, b, &old_count);
155 if (r)
156 return r;
157
158 if (!old_count)
159 smd->nr_allocated_this_transaction--;
160 }
161
162 return r;
163} 147}
164 148
165static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) 149static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)