Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/Kconfig                             |   13
-rw-r--r--  drivers/md/Makefile                            |    2
-rw-r--r--  drivers/md/dm-bio-prison.c                     |    9
-rw-r--r--  drivers/md/dm-bio-prison.h                     |   11
-rw-r--r--  drivers/md/dm-cache-block-types.h              |   54
-rw-r--r--  drivers/md/dm-cache-metadata.c                 | 1146
-rw-r--r--  drivers/md/dm-cache-metadata.h                 |  142
-rw-r--r--  drivers/md/dm-cache-policy-internal.h          |  124
-rw-r--r--  drivers/md/dm-cache-policy.c                   |  161
-rw-r--r--  drivers/md/dm-cache-policy.h                   |  228
-rw-r--r--  drivers/md/dm-cache-target.c                   | 2584
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c  |    1
12 files changed, 4475 insertions(+), 0 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 7cdf359d6b23..1a4fbcdb5ca2 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -268,6 +268,19 @@ config DM_DEBUG_BLOCK_STACK_TRACING
268 | 268 | ||
269 | If unsure, say N. | 269 | If unsure, say N. |
270 | 270 | ||
271 | config DM_CACHE | ||
272 | tristate "Cache target (EXPERIMENTAL)" | ||
273 | depends on BLK_DEV_DM | ||
274 | default n | ||
275 | select DM_PERSISTENT_DATA | ||
276 | select DM_BIO_PRISON | ||
277 | ---help--- | ||
278 | dm-cache attempts to improve performance of a block device by | ||
279 | moving frequently used data to a smaller, higher performance | ||
280 | device. Different 'policy' plugins can be used to change the | ||
281 | algorithms used to select which blocks are promoted, demoted, | ||
282 | cleaned etc. It supports writeback and writethrough modes. | ||
283 | |||
271 | config DM_MIRROR | 284 | config DM_MIRROR |
272 | tristate "Mirror target" | 285 | tristate "Mirror target" |
273 | depends on BLK_DEV_DM | 286 | depends on BLK_DEV_DM |
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 94dce8b49324..24b52560f4d2 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,6 +11,7 @@ dm-mirror-y += dm-raid1.o
11 | dm-log-userspace-y \ | 11 | dm-log-userspace-y \ |
12 | += dm-log-userspace-base.o dm-log-userspace-transfer.o | 12 | += dm-log-userspace-base.o dm-log-userspace-transfer.o |
13 | dm-thin-pool-y += dm-thin.o dm-thin-metadata.o | 13 | dm-thin-pool-y += dm-thin.o dm-thin-metadata.o |
14 | dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o | ||
14 | md-mod-y += md.o bitmap.o | 15 | md-mod-y += md.o bitmap.o |
15 | raid456-y += raid5.o | 16 | raid456-y += raid5.o |
16 | 17 | ||
@@ -44,6 +45,7 @@ obj-$(CONFIG_DM_ZERO) += dm-zero.o
44 | obj-$(CONFIG_DM_RAID) += dm-raid.o | 45 | obj-$(CONFIG_DM_RAID) += dm-raid.o |
45 | obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o | 46 | obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o |
46 | obj-$(CONFIG_DM_VERITY) += dm-verity.o | 47 | obj-$(CONFIG_DM_VERITY) += dm-verity.o |
48 | obj-$(CONFIG_DM_CACHE) += dm-cache.o | ||
47 | 49 | ||
48 | ifeq ($(CONFIG_DM_UEVENT),y) | 50 | ifeq ($(CONFIG_DM_UEVENT),y) |
49 | dm-mod-objs += dm-uevent.o | 51 | dm-mod-objs += dm-uevent.o |
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index 144067c95aba..85f0b7074257 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -179,6 +179,15 @@ int dm_bio_detain(struct dm_bio_prison *prison,
179 | } | 179 | } |
180 | EXPORT_SYMBOL_GPL(dm_bio_detain); | 180 | EXPORT_SYMBOL_GPL(dm_bio_detain); |
181 | 181 | ||
182 | int dm_get_cell(struct dm_bio_prison *prison, | ||
183 | struct dm_cell_key *key, | ||
184 | struct dm_bio_prison_cell *cell_prealloc, | ||
185 | struct dm_bio_prison_cell **cell_result) | ||
186 | { | ||
187 | return bio_detain(prison, key, NULL, cell_prealloc, cell_result); | ||
188 | } | ||
189 | EXPORT_SYMBOL_GPL(dm_get_cell); | ||
190 | |||
182 | /* | 191 | /* |
183 | * @inmates must have been initialised prior to this call | 192 | * @inmates must have been initialised prior to this call |
184 | */ | 193 | */ |
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 981a02d3a055..3f833190eadf 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -57,6 +57,17 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
57 | struct dm_bio_prison_cell *cell); | 57 | struct dm_bio_prison_cell *cell); |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * Creates, or retrieves a cell for the given key. | ||
61 | * | ||
62 | * Returns 1 if pre-existing cell returned, zero if new cell created using | ||
63 | * @cell_prealloc. | ||
64 | */ | ||
65 | int dm_get_cell(struct dm_bio_prison *prison, | ||
66 | struct dm_cell_key *key, | ||
67 | struct dm_bio_prison_cell *cell_prealloc, | ||
68 | struct dm_bio_prison_cell **cell_result); | ||
69 | |||
70 | /* | ||
60 | * An atomic op that combines retrieving a cell, and adding a bio to it. | 71 | * An atomic op that combines retrieving a cell, and adding a bio to it. |
61 | * | 72 | * |
62 | * Returns 1 if the cell was already held, 0 if @inmate is the new holder. | 73 | * Returns 1 if the cell was already held, 0 if @inmate is the new holder. |
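The new dm_get_cell() lets a caller take out a cell without attaching a bio to it. Below is a minimal caller sketch, not part of this patch; dm_bio_prison_alloc_cell() taking a gfp mask is assumed to be the allocation counterpart of dm_bio_prison_free_cell() declared above.

/*
 * Hypothetical sketch of using dm_get_cell() with a preallocated cell.
 * On 0 the prealloc became the new cell; on 1 a pre-existing cell was
 * returned and the prealloc must be freed by the caller.
 */
static int hold_region(struct dm_bio_prison *prison,
		       struct dm_cell_key *key,
		       struct dm_bio_prison_cell **cell_out)
{
	int r;
	struct dm_bio_prison_cell *prealloc;

	prealloc = dm_bio_prison_alloc_cell(prison, GFP_NOIO);	/* assumed API */
	if (!prealloc)
		return -ENOMEM;

	r = dm_get_cell(prison, key, prealloc, cell_out);
	if (r < 0 || r == 1)
		/* error, or the key was already held: prealloc is unused */
		dm_bio_prison_free_cell(prison, prealloc);

	return r;
}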
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
new file mode 100644
index 000000000000..bed4ad4e1b7c
--- /dev/null
+++ b/drivers/md/dm-cache-block-types.h
@@ -0,0 +1,54 @@
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_CACHE_BLOCK_TYPES_H | ||
8 | #define DM_CACHE_BLOCK_TYPES_H | ||
9 | |||
10 | #include "persistent-data/dm-block-manager.h" | ||
11 | |||
12 | /*----------------------------------------------------------------*/ | ||
13 | |||
14 | /* | ||
15 | * It's helpful to get sparse to differentiate between indexes into the | ||
16 | * origin device, indexes into the cache device, and indexes into the | ||
17 | * discard bitset. | ||
18 | */ | ||
19 | |||
20 | typedef dm_block_t __bitwise__ dm_oblock_t; | ||
21 | typedef uint32_t __bitwise__ dm_cblock_t; | ||
22 | typedef dm_block_t __bitwise__ dm_dblock_t; | ||
23 | |||
24 | static inline dm_oblock_t to_oblock(dm_block_t b) | ||
25 | { | ||
26 | return (__force dm_oblock_t) b; | ||
27 | } | ||
28 | |||
29 | static inline dm_block_t from_oblock(dm_oblock_t b) | ||
30 | { | ||
31 | return (__force dm_block_t) b; | ||
32 | } | ||
33 | |||
34 | static inline dm_cblock_t to_cblock(uint32_t b) | ||
35 | { | ||
36 | return (__force dm_cblock_t) b; | ||
37 | } | ||
38 | |||
39 | static inline uint32_t from_cblock(dm_cblock_t b) | ||
40 | { | ||
41 | return (__force uint32_t) b; | ||
42 | } | ||
43 | |||
44 | static inline dm_dblock_t to_dblock(dm_block_t b) | ||
45 | { | ||
46 | return (__force dm_dblock_t) b; | ||
47 | } | ||
48 | |||
49 | static inline dm_block_t from_dblock(dm_dblock_t b) | ||
50 | { | ||
51 | return (__force dm_block_t) b; | ||
52 | } | ||
53 | |||
54 | #endif /* DM_CACHE_BLOCK_TYPES_H */ | ||
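A small illustration, not from the patch, of what the __bitwise__ typedefs buy: sparse rejects code that mixes the origin, cache and discard index spaces, so every conversion has to go through the helpers above.

static dm_oblock_t next_origin_block(dm_oblock_t ob)
{
	/*
	 * Writing "ob + 1" directly would draw a sparse warning; unwrap to a
	 * plain dm_block_t, do the arithmetic, then wrap again.
	 */
	return to_oblock(from_oblock(ob) + 1);
}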
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
new file mode 100644
index 000000000000..fbd3625f2748
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.c
@@ -0,0 +1,1146 @@
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm-cache-metadata.h" | ||
8 | |||
9 | #include "persistent-data/dm-array.h" | ||
10 | #include "persistent-data/dm-bitset.h" | ||
11 | #include "persistent-data/dm-space-map.h" | ||
12 | #include "persistent-data/dm-space-map-disk.h" | ||
13 | #include "persistent-data/dm-transaction-manager.h" | ||
14 | |||
15 | #include <linux/device-mapper.h> | ||
16 | |||
17 | /*----------------------------------------------------------------*/ | ||
18 | |||
19 | #define DM_MSG_PREFIX "cache metadata" | ||
20 | |||
21 | #define CACHE_SUPERBLOCK_MAGIC 06142003 | ||
22 | #define CACHE_SUPERBLOCK_LOCATION 0 | ||
23 | #define CACHE_VERSION 1 | ||
24 | #define CACHE_METADATA_CACHE_SIZE 64 | ||
25 | |||
26 | /* | ||
27 | * 3 for btree insert + | ||
28 | * 2 for btree lookup used within space map | ||
29 | */ | ||
30 | #define CACHE_MAX_CONCURRENT_LOCKS 5 | ||
31 | #define SPACE_MAP_ROOT_SIZE 128 | ||
32 | |||
33 | enum superblock_flag_bits { | ||
34 | /* for spotting crashes that would invalidate the dirty bitset */ | ||
35 | CLEAN_SHUTDOWN, | ||
36 | }; | ||
37 | |||
38 | /* | ||
39 | * Each mapping from cache block -> origin block carries a set of flags. | ||
40 | */ | ||
41 | enum mapping_bits { | ||
42 | /* | ||
43 | * A valid mapping. Because we're using an array we clear this | ||
44 | * flag for an non existant mapping. | ||
45 | */ | ||
46 | M_VALID = 1, | ||
47 | |||
48 | /* | ||
49 | * The data on the cache is different from that on the origin. | ||
50 | */ | ||
51 | M_DIRTY = 2 | ||
52 | }; | ||
53 | |||
54 | struct cache_disk_superblock { | ||
55 | __le32 csum; | ||
56 | __le32 flags; | ||
57 | __le64 blocknr; | ||
58 | |||
59 | __u8 uuid[16]; | ||
60 | __le64 magic; | ||
61 | __le32 version; | ||
62 | |||
63 | __u8 policy_name[CACHE_POLICY_NAME_SIZE]; | ||
64 | __le32 policy_hint_size; | ||
65 | |||
66 | __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; | ||
67 | __le64 mapping_root; | ||
68 | __le64 hint_root; | ||
69 | |||
70 | __le64 discard_root; | ||
71 | __le64 discard_block_size; | ||
72 | __le64 discard_nr_blocks; | ||
73 | |||
74 | __le32 data_block_size; | ||
75 | __le32 metadata_block_size; | ||
76 | __le32 cache_blocks; | ||
77 | |||
78 | __le32 compat_flags; | ||
79 | __le32 compat_ro_flags; | ||
80 | __le32 incompat_flags; | ||
81 | |||
82 | __le32 read_hits; | ||
83 | __le32 read_misses; | ||
84 | __le32 write_hits; | ||
85 | __le32 write_misses; | ||
86 | } __packed; | ||
87 | |||
88 | struct dm_cache_metadata { | ||
89 | struct block_device *bdev; | ||
90 | struct dm_block_manager *bm; | ||
91 | struct dm_space_map *metadata_sm; | ||
92 | struct dm_transaction_manager *tm; | ||
93 | |||
94 | struct dm_array_info info; | ||
95 | struct dm_array_info hint_info; | ||
96 | struct dm_disk_bitset discard_info; | ||
97 | |||
98 | struct rw_semaphore root_lock; | ||
99 | dm_block_t root; | ||
100 | dm_block_t hint_root; | ||
101 | dm_block_t discard_root; | ||
102 | |||
103 | sector_t discard_block_size; | ||
104 | dm_dblock_t discard_nr_blocks; | ||
105 | |||
106 | sector_t data_block_size; | ||
107 | dm_cblock_t cache_blocks; | ||
108 | bool changed:1; | ||
109 | bool clean_when_opened:1; | ||
110 | |||
111 | char policy_name[CACHE_POLICY_NAME_SIZE]; | ||
112 | size_t policy_hint_size; | ||
113 | struct dm_cache_statistics stats; | ||
114 | }; | ||
115 | |||
116 | /*------------------------------------------------------------------- | ||
117 | * superblock validator | ||
118 | *-----------------------------------------------------------------*/ | ||
119 | |||
120 | #define SUPERBLOCK_CSUM_XOR 9031977 | ||
121 | |||
122 | static void sb_prepare_for_write(struct dm_block_validator *v, | ||
123 | struct dm_block *b, | ||
124 | size_t sb_block_size) | ||
125 | { | ||
126 | struct cache_disk_superblock *disk_super = dm_block_data(b); | ||
127 | |||
128 | disk_super->blocknr = cpu_to_le64(dm_block_location(b)); | ||
129 | disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, | ||
130 | sb_block_size - sizeof(__le32), | ||
131 | SUPERBLOCK_CSUM_XOR)); | ||
132 | } | ||
133 | |||
134 | static int sb_check(struct dm_block_validator *v, | ||
135 | struct dm_block *b, | ||
136 | size_t sb_block_size) | ||
137 | { | ||
138 | struct cache_disk_superblock *disk_super = dm_block_data(b); | ||
139 | __le32 csum_le; | ||
140 | |||
141 | if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { | ||
142 | DMERR("sb_check failed: blocknr %llu: wanted %llu", | ||
143 | le64_to_cpu(disk_super->blocknr), | ||
144 | (unsigned long long)dm_block_location(b)); | ||
145 | return -ENOTBLK; | ||
146 | } | ||
147 | |||
148 | if (le64_to_cpu(disk_super->magic) != CACHE_SUPERBLOCK_MAGIC) { | ||
149 | DMERR("sb_check failed: magic %llu: wanted %llu", | ||
150 | le64_to_cpu(disk_super->magic), | ||
151 | (unsigned long long)CACHE_SUPERBLOCK_MAGIC); | ||
152 | return -EILSEQ; | ||
153 | } | ||
154 | |||
155 | csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, | ||
156 | sb_block_size - sizeof(__le32), | ||
157 | SUPERBLOCK_CSUM_XOR)); | ||
158 | if (csum_le != disk_super->csum) { | ||
159 | DMERR("sb_check failed: csum %u: wanted %u", | ||
160 | le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); | ||
161 | return -EILSEQ; | ||
162 | } | ||
163 | |||
164 | return 0; | ||
165 | } | ||
166 | |||
167 | static struct dm_block_validator sb_validator = { | ||
168 | .name = "superblock", | ||
169 | .prepare_for_write = sb_prepare_for_write, | ||
170 | .check = sb_check | ||
171 | }; | ||
172 | |||
173 | /*----------------------------------------------------------------*/ | ||
174 | |||
175 | static int superblock_read_lock(struct dm_cache_metadata *cmd, | ||
176 | struct dm_block **sblock) | ||
177 | { | ||
178 | return dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION, | ||
179 | &sb_validator, sblock); | ||
180 | } | ||
181 | |||
182 | static int superblock_lock_zero(struct dm_cache_metadata *cmd, | ||
183 | struct dm_block **sblock) | ||
184 | { | ||
185 | return dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION, | ||
186 | &sb_validator, sblock); | ||
187 | } | ||
188 | |||
189 | static int superblock_lock(struct dm_cache_metadata *cmd, | ||
190 | struct dm_block **sblock) | ||
191 | { | ||
192 | return dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION, | ||
193 | &sb_validator, sblock); | ||
194 | } | ||
195 | |||
196 | /*----------------------------------------------------------------*/ | ||
197 | |||
198 | static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) | ||
199 | { | ||
200 | int r; | ||
201 | unsigned i; | ||
202 | struct dm_block *b; | ||
203 | __le64 *data_le, zero = cpu_to_le64(0); | ||
204 | unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64); | ||
205 | |||
206 | /* | ||
207 | * We can't use a validator here - it may be all zeroes. | ||
208 | */ | ||
209 | r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b); | ||
210 | if (r) | ||
211 | return r; | ||
212 | |||
213 | data_le = dm_block_data(b); | ||
214 | *result = 1; | ||
215 | for (i = 0; i < sb_block_size; i++) { | ||
216 | if (data_le[i] != zero) { | ||
217 | *result = 0; | ||
218 | break; | ||
219 | } | ||
220 | } | ||
221 | |||
222 | return dm_bm_unlock(b); | ||
223 | } | ||
224 | |||
225 | static void __setup_mapping_info(struct dm_cache_metadata *cmd) | ||
226 | { | ||
227 | struct dm_btree_value_type vt; | ||
228 | |||
229 | vt.context = NULL; | ||
230 | vt.size = sizeof(__le64); | ||
231 | vt.inc = NULL; | ||
232 | vt.dec = NULL; | ||
233 | vt.equal = NULL; | ||
234 | dm_array_info_init(&cmd->info, cmd->tm, &vt); | ||
235 | |||
236 | if (cmd->policy_hint_size) { | ||
237 | vt.size = sizeof(__le32); | ||
238 | dm_array_info_init(&cmd->hint_info, cmd->tm, &vt); | ||
239 | } | ||
240 | } | ||
241 | |||
242 | static int __write_initial_superblock(struct dm_cache_metadata *cmd) | ||
243 | { | ||
244 | int r; | ||
245 | struct dm_block *sblock; | ||
246 | size_t metadata_len; | ||
247 | struct cache_disk_superblock *disk_super; | ||
248 | sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT; | ||
249 | |||
250 | /* FIXME: see if we can lose the max sectors limit */ | ||
251 | if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS) | ||
252 | bdev_size = DM_CACHE_METADATA_MAX_SECTORS; | ||
253 | |||
254 | r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); | ||
255 | if (r < 0) | ||
256 | return r; | ||
257 | |||
258 | r = dm_tm_pre_commit(cmd->tm); | ||
259 | if (r < 0) | ||
260 | return r; | ||
261 | |||
262 | r = superblock_lock_zero(cmd, &sblock); | ||
263 | if (r) | ||
264 | return r; | ||
265 | |||
266 | disk_super = dm_block_data(sblock); | ||
267 | disk_super->flags = 0; | ||
268 | memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); | ||
269 | disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); | ||
270 | disk_super->version = cpu_to_le32(CACHE_VERSION); | ||
271 | memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE); | ||
272 | disk_super->policy_hint_size = 0; | ||
273 | |||
274 | r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, | ||
275 | metadata_len); | ||
276 | if (r < 0) | ||
277 | goto bad_locked; | ||
278 | |||
279 | disk_super->mapping_root = cpu_to_le64(cmd->root); | ||
280 | disk_super->hint_root = cpu_to_le64(cmd->hint_root); | ||
281 | disk_super->discard_root = cpu_to_le64(cmd->discard_root); | ||
282 | disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); | ||
283 | disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); | ||
284 | disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); | ||
285 | disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); | ||
286 | disk_super->cache_blocks = cpu_to_le32(0); | ||
287 | memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); | ||
288 | |||
289 | disk_super->read_hits = cpu_to_le32(0); | ||
290 | disk_super->read_misses = cpu_to_le32(0); | ||
291 | disk_super->write_hits = cpu_to_le32(0); | ||
292 | disk_super->write_misses = cpu_to_le32(0); | ||
293 | |||
294 | return dm_tm_commit(cmd->tm, sblock); | ||
295 | |||
296 | bad_locked: | ||
297 | dm_bm_unlock(sblock); | ||
298 | return r; | ||
299 | } | ||
300 | |||
301 | static int __format_metadata(struct dm_cache_metadata *cmd) | ||
302 | { | ||
303 | int r; | ||
304 | |||
305 | r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION, | ||
306 | &cmd->tm, &cmd->metadata_sm); | ||
307 | if (r < 0) { | ||
308 | DMERR("tm_create_with_sm failed"); | ||
309 | return r; | ||
310 | } | ||
311 | |||
312 | __setup_mapping_info(cmd); | ||
313 | |||
314 | r = dm_array_empty(&cmd->info, &cmd->root); | ||
315 | if (r < 0) | ||
316 | goto bad; | ||
317 | |||
318 | dm_disk_bitset_init(cmd->tm, &cmd->discard_info); | ||
319 | |||
320 | r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root); | ||
321 | if (r < 0) | ||
322 | goto bad; | ||
323 | |||
324 | cmd->discard_block_size = 0; | ||
325 | cmd->discard_nr_blocks = 0; | ||
326 | |||
327 | r = __write_initial_superblock(cmd); | ||
328 | if (r) | ||
329 | goto bad; | ||
330 | |||
331 | cmd->clean_when_opened = true; | ||
332 | return 0; | ||
333 | |||
334 | bad: | ||
335 | dm_tm_destroy(cmd->tm); | ||
336 | dm_sm_destroy(cmd->metadata_sm); | ||
337 | |||
338 | return r; | ||
339 | } | ||
340 | |||
341 | static int __check_incompat_features(struct cache_disk_superblock *disk_super, | ||
342 | struct dm_cache_metadata *cmd) | ||
343 | { | ||
344 | uint32_t features; | ||
345 | |||
346 | features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; | ||
347 | if (features) { | ||
348 | DMERR("could not access metadata due to unsupported optional features (%lx).", | ||
349 | (unsigned long)features); | ||
350 | return -EINVAL; | ||
351 | } | ||
352 | |||
353 | /* | ||
354 | * Check for read-only metadata to skip the following RDWR checks. | ||
355 | */ | ||
356 | if (get_disk_ro(cmd->bdev->bd_disk)) | ||
357 | return 0; | ||
358 | |||
359 | features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP; | ||
360 | if (features) { | ||
361 | DMERR("could not access metadata RDWR due to unsupported optional features (%lx).", | ||
362 | (unsigned long)features); | ||
363 | return -EINVAL; | ||
364 | } | ||
365 | |||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | static int __open_metadata(struct dm_cache_metadata *cmd) | ||
370 | { | ||
371 | int r; | ||
372 | struct dm_block *sblock; | ||
373 | struct cache_disk_superblock *disk_super; | ||
374 | unsigned long sb_flags; | ||
375 | |||
376 | r = superblock_read_lock(cmd, &sblock); | ||
377 | if (r < 0) { | ||
378 | DMERR("couldn't read lock superblock"); | ||
379 | return r; | ||
380 | } | ||
381 | |||
382 | disk_super = dm_block_data(sblock); | ||
383 | |||
384 | r = __check_incompat_features(disk_super, cmd); | ||
385 | if (r < 0) | ||
386 | goto bad; | ||
387 | |||
388 | r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION, | ||
389 | disk_super->metadata_space_map_root, | ||
390 | sizeof(disk_super->metadata_space_map_root), | ||
391 | &cmd->tm, &cmd->metadata_sm); | ||
392 | if (r < 0) { | ||
393 | DMERR("tm_open_with_sm failed"); | ||
394 | goto bad; | ||
395 | } | ||
396 | |||
397 | __setup_mapping_info(cmd); | ||
398 | dm_disk_bitset_init(cmd->tm, &cmd->discard_info); | ||
399 | sb_flags = le32_to_cpu(disk_super->flags); | ||
400 | cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); | ||
401 | return dm_bm_unlock(sblock); | ||
402 | |||
403 | bad: | ||
404 | dm_bm_unlock(sblock); | ||
405 | return r; | ||
406 | } | ||
407 | |||
408 | static int __open_or_format_metadata(struct dm_cache_metadata *cmd, | ||
409 | bool format_device) | ||
410 | { | ||
411 | int r, unformatted; | ||
412 | |||
413 | r = __superblock_all_zeroes(cmd->bm, &unformatted); | ||
414 | if (r) | ||
415 | return r; | ||
416 | |||
417 | if (unformatted) | ||
418 | return format_device ? __format_metadata(cmd) : -EPERM; | ||
419 | |||
420 | return __open_metadata(cmd); | ||
421 | } | ||
422 | |||
423 | static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, | ||
424 | bool may_format_device) | ||
425 | { | ||
426 | int r; | ||
427 | cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE, | ||
428 | CACHE_METADATA_CACHE_SIZE, | ||
429 | CACHE_MAX_CONCURRENT_LOCKS); | ||
430 | if (IS_ERR(cmd->bm)) { | ||
431 | DMERR("could not create block manager"); | ||
432 | return PTR_ERR(cmd->bm); | ||
433 | } | ||
434 | |||
435 | r = __open_or_format_metadata(cmd, may_format_device); | ||
436 | if (r) | ||
437 | dm_block_manager_destroy(cmd->bm); | ||
438 | |||
439 | return r; | ||
440 | } | ||
441 | |||
442 | static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd) | ||
443 | { | ||
444 | dm_sm_destroy(cmd->metadata_sm); | ||
445 | dm_tm_destroy(cmd->tm); | ||
446 | dm_block_manager_destroy(cmd->bm); | ||
447 | } | ||
448 | |||
449 | typedef unsigned long (*flags_mutator)(unsigned long); | ||
450 | |||
451 | static void update_flags(struct cache_disk_superblock *disk_super, | ||
452 | flags_mutator mutator) | ||
453 | { | ||
454 | uint32_t sb_flags = mutator(le32_to_cpu(disk_super->flags)); | ||
455 | disk_super->flags = cpu_to_le32(sb_flags); | ||
456 | } | ||
457 | |||
458 | static unsigned long set_clean_shutdown(unsigned long flags) | ||
459 | { | ||
460 | set_bit(CLEAN_SHUTDOWN, &flags); | ||
461 | return flags; | ||
462 | } | ||
463 | |||
464 | static unsigned long clear_clean_shutdown(unsigned long flags) | ||
465 | { | ||
466 | clear_bit(CLEAN_SHUTDOWN, &flags); | ||
467 | return flags; | ||
468 | } | ||
469 | |||
470 | static void read_superblock_fields(struct dm_cache_metadata *cmd, | ||
471 | struct cache_disk_superblock *disk_super) | ||
472 | { | ||
473 | cmd->root = le64_to_cpu(disk_super->mapping_root); | ||
474 | cmd->hint_root = le64_to_cpu(disk_super->hint_root); | ||
475 | cmd->discard_root = le64_to_cpu(disk_super->discard_root); | ||
476 | cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size); | ||
477 | cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks)); | ||
478 | cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); | ||
479 | cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); | ||
480 | strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); | ||
481 | cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size); | ||
482 | |||
483 | cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits); | ||
484 | cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses); | ||
485 | cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits); | ||
486 | cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses); | ||
487 | |||
488 | cmd->changed = false; | ||
489 | } | ||
490 | |||
491 | /* | ||
492 | * The mutator updates the superblock flags. | ||
493 | */ | ||
494 | static int __begin_transaction_flags(struct dm_cache_metadata *cmd, | ||
495 | flags_mutator mutator) | ||
496 | { | ||
497 | int r; | ||
498 | struct cache_disk_superblock *disk_super; | ||
499 | struct dm_block *sblock; | ||
500 | |||
501 | r = superblock_lock(cmd, &sblock); | ||
502 | if (r) | ||
503 | return r; | ||
504 | |||
505 | disk_super = dm_block_data(sblock); | ||
506 | update_flags(disk_super, mutator); | ||
507 | read_superblock_fields(cmd, disk_super); | ||
508 | |||
509 | return dm_bm_flush_and_unlock(cmd->bm, sblock); | ||
510 | } | ||
511 | |||
512 | static int __begin_transaction(struct dm_cache_metadata *cmd) | ||
513 | { | ||
514 | int r; | ||
515 | struct cache_disk_superblock *disk_super; | ||
516 | struct dm_block *sblock; | ||
517 | |||
518 | /* | ||
519 | * We re-read the superblock every time. Shouldn't need to do this | ||
520 | * really. | ||
521 | */ | ||
522 | r = superblock_read_lock(cmd, &sblock); | ||
523 | if (r) | ||
524 | return r; | ||
525 | |||
526 | disk_super = dm_block_data(sblock); | ||
527 | read_superblock_fields(cmd, disk_super); | ||
528 | dm_bm_unlock(sblock); | ||
529 | |||
530 | return 0; | ||
531 | } | ||
532 | |||
533 | static int __commit_transaction(struct dm_cache_metadata *cmd, | ||
534 | flags_mutator mutator) | ||
535 | { | ||
536 | int r; | ||
537 | size_t metadata_len; | ||
538 | struct cache_disk_superblock *disk_super; | ||
539 | struct dm_block *sblock; | ||
540 | |||
541 | /* | ||
542 | * We need to know if the cache_disk_superblock exceeds a 512-byte sector. | ||
543 | */ | ||
544 | BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512); | ||
545 | |||
546 | r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, | ||
547 | &cmd->discard_root); | ||
548 | if (r) | ||
549 | return r; | ||
550 | |||
551 | r = dm_tm_pre_commit(cmd->tm); | ||
552 | if (r < 0) | ||
553 | return r; | ||
554 | |||
555 | r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); | ||
556 | if (r < 0) | ||
557 | return r; | ||
558 | |||
559 | r = superblock_lock(cmd, &sblock); | ||
560 | if (r) | ||
561 | return r; | ||
562 | |||
563 | disk_super = dm_block_data(sblock); | ||
564 | |||
565 | if (mutator) | ||
566 | update_flags(disk_super, mutator); | ||
567 | |||
568 | disk_super->mapping_root = cpu_to_le64(cmd->root); | ||
569 | disk_super->hint_root = cpu_to_le64(cmd->hint_root); | ||
570 | disk_super->discard_root = cpu_to_le64(cmd->discard_root); | ||
571 | disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); | ||
572 | disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); | ||
573 | disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); | ||
574 | strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); | ||
575 | |||
576 | disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); | ||
577 | disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); | ||
578 | disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits); | ||
579 | disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses); | ||
580 | |||
581 | r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, | ||
582 | metadata_len); | ||
583 | if (r < 0) { | ||
584 | dm_bm_unlock(sblock); | ||
585 | return r; | ||
586 | } | ||
587 | |||
588 | return dm_tm_commit(cmd->tm, sblock); | ||
589 | } | ||
590 | |||
591 | /*----------------------------------------------------------------*/ | ||
592 | |||
593 | /* | ||
594 | * The mappings are held in a dm-array that has 64-bit values stored in | ||
595 | * little-endian format. The index is the cblock, the high 48bits of the | ||
596 | * value are the oblock and the low 16 bit the flags. | ||
597 | */ | ||
598 | #define FLAGS_MASK ((1 << 16) - 1) | ||
599 | |||
600 | static __le64 pack_value(dm_oblock_t block, unsigned flags) | ||
601 | { | ||
602 | uint64_t value = from_oblock(block); | ||
603 | value <<= 16; | ||
604 | value = value | (flags & FLAGS_MASK); | ||
605 | return cpu_to_le64(value); | ||
606 | } | ||
607 | |||
608 | static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags) | ||
609 | { | ||
610 | uint64_t value = le64_to_cpu(value_le); | ||
611 | uint64_t b = value >> 16; | ||
612 | *block = to_oblock(b); | ||
613 | *flags = value & FLAGS_MASK; | ||
614 | } | ||
615 | |||
616 | /*----------------------------------------------------------------*/ | ||
617 | |||
618 | struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, | ||
619 | sector_t data_block_size, | ||
620 | bool may_format_device, | ||
621 | size_t policy_hint_size) | ||
622 | { | ||
623 | int r; | ||
624 | struct dm_cache_metadata *cmd; | ||
625 | |||
626 | cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); | ||
627 | if (!cmd) { | ||
628 | DMERR("could not allocate metadata struct"); | ||
629 | return NULL; | ||
630 | } | ||
631 | |||
632 | init_rwsem(&cmd->root_lock); | ||
633 | cmd->bdev = bdev; | ||
634 | cmd->data_block_size = data_block_size; | ||
635 | cmd->cache_blocks = 0; | ||
636 | cmd->policy_hint_size = policy_hint_size; | ||
637 | cmd->changed = true; | ||
638 | |||
639 | r = __create_persistent_data_objects(cmd, may_format_device); | ||
640 | if (r) { | ||
641 | kfree(cmd); | ||
642 | return ERR_PTR(r); | ||
643 | } | ||
644 | |||
645 | r = __begin_transaction_flags(cmd, clear_clean_shutdown); | ||
646 | if (r < 0) { | ||
647 | dm_cache_metadata_close(cmd); | ||
648 | return ERR_PTR(r); | ||
649 | } | ||
650 | |||
651 | return cmd; | ||
652 | } | ||
653 | |||
654 | void dm_cache_metadata_close(struct dm_cache_metadata *cmd) | ||
655 | { | ||
656 | __destroy_persistent_data_objects(cmd); | ||
657 | kfree(cmd); | ||
658 | } | ||
659 | |||
660 | int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) | ||
661 | { | ||
662 | int r; | ||
663 | __le64 null_mapping = pack_value(0, 0); | ||
664 | |||
665 | down_write(&cmd->root_lock); | ||
666 | __dm_bless_for_disk(&null_mapping); | ||
667 | r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), | ||
668 | from_cblock(new_cache_size), | ||
669 | &null_mapping, &cmd->root); | ||
670 | if (!r) | ||
671 | cmd->cache_blocks = new_cache_size; | ||
672 | cmd->changed = true; | ||
673 | up_write(&cmd->root_lock); | ||
674 | |||
675 | return r; | ||
676 | } | ||
677 | |||
678 | int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, | ||
679 | sector_t discard_block_size, | ||
680 | dm_dblock_t new_nr_entries) | ||
681 | { | ||
682 | int r; | ||
683 | |||
684 | down_write(&cmd->root_lock); | ||
685 | r = dm_bitset_resize(&cmd->discard_info, | ||
686 | cmd->discard_root, | ||
687 | from_dblock(cmd->discard_nr_blocks), | ||
688 | from_dblock(new_nr_entries), | ||
689 | false, &cmd->discard_root); | ||
690 | if (!r) { | ||
691 | cmd->discard_block_size = discard_block_size; | ||
692 | cmd->discard_nr_blocks = new_nr_entries; | ||
693 | } | ||
694 | |||
695 | cmd->changed = true; | ||
696 | up_write(&cmd->root_lock); | ||
697 | |||
698 | return r; | ||
699 | } | ||
700 | |||
701 | static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) | ||
702 | { | ||
703 | return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root, | ||
704 | from_dblock(b), &cmd->discard_root); | ||
705 | } | ||
706 | |||
707 | static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) | ||
708 | { | ||
709 | return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root, | ||
710 | from_dblock(b), &cmd->discard_root); | ||
711 | } | ||
712 | |||
713 | static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b, | ||
714 | bool *is_discarded) | ||
715 | { | ||
716 | return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root, | ||
717 | from_dblock(b), &cmd->discard_root, | ||
718 | is_discarded); | ||
719 | } | ||
720 | |||
721 | static int __discard(struct dm_cache_metadata *cmd, | ||
722 | dm_dblock_t dblock, bool discard) | ||
723 | { | ||
724 | int r; | ||
725 | |||
726 | r = (discard ? __set_discard : __clear_discard)(cmd, dblock); | ||
727 | if (r) | ||
728 | return r; | ||
729 | |||
730 | cmd->changed = true; | ||
731 | return 0; | ||
732 | } | ||
733 | |||
734 | int dm_cache_set_discard(struct dm_cache_metadata *cmd, | ||
735 | dm_dblock_t dblock, bool discard) | ||
736 | { | ||
737 | int r; | ||
738 | |||
739 | down_write(&cmd->root_lock); | ||
740 | r = __discard(cmd, dblock, discard); | ||
741 | up_write(&cmd->root_lock); | ||
742 | |||
743 | return r; | ||
744 | } | ||
745 | |||
746 | static int __load_discards(struct dm_cache_metadata *cmd, | ||
747 | load_discard_fn fn, void *context) | ||
748 | { | ||
749 | int r = 0; | ||
750 | dm_block_t b; | ||
751 | bool discard; | ||
752 | |||
753 | for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { | ||
754 | dm_dblock_t dblock = to_dblock(b); | ||
755 | |||
756 | if (cmd->clean_when_opened) { | ||
757 | r = __is_discarded(cmd, dblock, &discard); | ||
758 | if (r) | ||
759 | return r; | ||
760 | } else | ||
761 | discard = false; | ||
762 | |||
763 | r = fn(context, cmd->discard_block_size, dblock, discard); | ||
764 | if (r) | ||
765 | break; | ||
766 | } | ||
767 | |||
768 | return r; | ||
769 | } | ||
770 | |||
771 | int dm_cache_load_discards(struct dm_cache_metadata *cmd, | ||
772 | load_discard_fn fn, void *context) | ||
773 | { | ||
774 | int r; | ||
775 | |||
776 | down_read(&cmd->root_lock); | ||
777 | r = __load_discards(cmd, fn, context); | ||
778 | up_read(&cmd->root_lock); | ||
779 | |||
780 | return r; | ||
781 | } | ||
782 | |||
783 | dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd) | ||
784 | { | ||
785 | dm_cblock_t r; | ||
786 | |||
787 | down_read(&cmd->root_lock); | ||
788 | r = cmd->cache_blocks; | ||
789 | up_read(&cmd->root_lock); | ||
790 | |||
791 | return r; | ||
792 | } | ||
793 | |||
794 | static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock) | ||
795 | { | ||
796 | int r; | ||
797 | __le64 value = pack_value(0, 0); | ||
798 | |||
799 | __dm_bless_for_disk(&value); | ||
800 | r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), | ||
801 | &value, &cmd->root); | ||
802 | if (r) | ||
803 | return r; | ||
804 | |||
805 | cmd->changed = true; | ||
806 | return 0; | ||
807 | } | ||
808 | |||
809 | int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock) | ||
810 | { | ||
811 | int r; | ||
812 | |||
813 | down_write(&cmd->root_lock); | ||
814 | r = __remove(cmd, cblock); | ||
815 | up_write(&cmd->root_lock); | ||
816 | |||
817 | return r; | ||
818 | } | ||
819 | |||
820 | static int __insert(struct dm_cache_metadata *cmd, | ||
821 | dm_cblock_t cblock, dm_oblock_t oblock) | ||
822 | { | ||
823 | int r; | ||
824 | __le64 value = pack_value(oblock, M_VALID); | ||
825 | __dm_bless_for_disk(&value); | ||
826 | |||
827 | r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), | ||
828 | &value, &cmd->root); | ||
829 | if (r) | ||
830 | return r; | ||
831 | |||
832 | cmd->changed = true; | ||
833 | return 0; | ||
834 | } | ||
835 | |||
836 | int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, | ||
837 | dm_cblock_t cblock, dm_oblock_t oblock) | ||
838 | { | ||
839 | int r; | ||
840 | |||
841 | down_write(&cmd->root_lock); | ||
842 | r = __insert(cmd, cblock, oblock); | ||
843 | up_write(&cmd->root_lock); | ||
844 | |||
845 | return r; | ||
846 | } | ||
847 | |||
848 | struct thunk { | ||
849 | load_mapping_fn fn; | ||
850 | void *context; | ||
851 | |||
852 | struct dm_cache_metadata *cmd; | ||
853 | bool respect_dirty_flags; | ||
854 | bool hints_valid; | ||
855 | }; | ||
856 | |||
857 | static bool hints_array_initialized(struct dm_cache_metadata *cmd) | ||
858 | { | ||
859 | return cmd->hint_root && cmd->policy_hint_size; | ||
860 | } | ||
861 | |||
862 | static bool hints_array_available(struct dm_cache_metadata *cmd, | ||
863 | const char *policy_name) | ||
864 | { | ||
865 | bool policy_names_match = !strncmp(cmd->policy_name, policy_name, | ||
866 | sizeof(cmd->policy_name)); | ||
867 | |||
868 | return cmd->clean_when_opened && policy_names_match && | ||
869 | hints_array_initialized(cmd); | ||
870 | } | ||
871 | |||
872 | static int __load_mapping(void *context, uint64_t cblock, void *leaf) | ||
873 | { | ||
874 | int r = 0; | ||
875 | bool dirty; | ||
876 | __le64 value; | ||
877 | __le32 hint_value = 0; | ||
878 | dm_oblock_t oblock; | ||
879 | unsigned flags; | ||
880 | struct thunk *thunk = context; | ||
881 | struct dm_cache_metadata *cmd = thunk->cmd; | ||
882 | |||
883 | memcpy(&value, leaf, sizeof(value)); | ||
884 | unpack_value(value, &oblock, &flags); | ||
885 | |||
886 | if (flags & M_VALID) { | ||
887 | if (thunk->hints_valid) { | ||
888 | r = dm_array_get_value(&cmd->hint_info, cmd->hint_root, | ||
889 | cblock, &hint_value); | ||
890 | if (r && r != -ENODATA) | ||
891 | return r; | ||
892 | } | ||
893 | |||
894 | dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true; | ||
895 | r = thunk->fn(thunk->context, oblock, to_cblock(cblock), | ||
896 | dirty, le32_to_cpu(hint_value), thunk->hints_valid); | ||
897 | } | ||
898 | |||
899 | return r; | ||
900 | } | ||
901 | |||
902 | static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, | ||
903 | load_mapping_fn fn, void *context) | ||
904 | { | ||
905 | struct thunk thunk; | ||
906 | |||
907 | thunk.fn = fn; | ||
908 | thunk.context = context; | ||
909 | |||
910 | thunk.cmd = cmd; | ||
911 | thunk.respect_dirty_flags = cmd->clean_when_opened; | ||
912 | thunk.hints_valid = hints_array_available(cmd, policy_name); | ||
913 | |||
914 | return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk); | ||
915 | } | ||
916 | |||
917 | int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, | ||
918 | load_mapping_fn fn, void *context) | ||
919 | { | ||
920 | int r; | ||
921 | |||
922 | down_read(&cmd->root_lock); | ||
923 | r = __load_mappings(cmd, policy_name, fn, context); | ||
924 | up_read(&cmd->root_lock); | ||
925 | |||
926 | return r; | ||
927 | } | ||
928 | |||
929 | static int __dump_mapping(void *context, uint64_t cblock, void *leaf) | ||
930 | { | ||
931 | int r = 0; | ||
932 | __le64 value; | ||
933 | dm_oblock_t oblock; | ||
934 | unsigned flags; | ||
935 | |||
936 | memcpy(&value, leaf, sizeof(value)); | ||
937 | unpack_value(value, &oblock, &flags); | ||
938 | |||
939 | return r; | ||
940 | } | ||
941 | |||
942 | static int __dump_mappings(struct dm_cache_metadata *cmd) | ||
943 | { | ||
944 | return dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL); | ||
945 | } | ||
946 | |||
947 | void dm_cache_dump(struct dm_cache_metadata *cmd) | ||
948 | { | ||
949 | down_read(&cmd->root_lock); | ||
950 | __dump_mappings(cmd); | ||
951 | up_read(&cmd->root_lock); | ||
952 | } | ||
953 | |||
954 | int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd) | ||
955 | { | ||
956 | int r; | ||
957 | |||
958 | down_read(&cmd->root_lock); | ||
959 | r = cmd->changed; | ||
960 | up_read(&cmd->root_lock); | ||
961 | |||
962 | return r; | ||
963 | } | ||
964 | |||
965 | static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty) | ||
966 | { | ||
967 | int r; | ||
968 | unsigned flags; | ||
969 | dm_oblock_t oblock; | ||
970 | __le64 value; | ||
971 | |||
972 | r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(cblock), &value); | ||
973 | if (r) | ||
974 | return r; | ||
975 | |||
976 | unpack_value(value, &oblock, &flags); | ||
977 | |||
978 | if (((flags & M_DIRTY) && dirty) || (!(flags & M_DIRTY) && !dirty)) | ||
979 | /* nothing to be done */ | ||
980 | return 0; | ||
981 | |||
982 | value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0)); | ||
983 | __dm_bless_for_disk(&value); | ||
984 | |||
985 | r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), | ||
986 | &value, &cmd->root); | ||
987 | if (r) | ||
988 | return r; | ||
989 | |||
990 | cmd->changed = true; | ||
991 | return 0; | ||
992 | |||
993 | } | ||
994 | |||
995 | int dm_cache_set_dirty(struct dm_cache_metadata *cmd, | ||
996 | dm_cblock_t cblock, bool dirty) | ||
997 | { | ||
998 | int r; | ||
999 | |||
1000 | down_write(&cmd->root_lock); | ||
1001 | r = __dirty(cmd, cblock, dirty); | ||
1002 | up_write(&cmd->root_lock); | ||
1003 | |||
1004 | return r; | ||
1005 | } | ||
1006 | |||
1007 | void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, | ||
1008 | struct dm_cache_statistics *stats) | ||
1009 | { | ||
1010 | down_read(&cmd->root_lock); | ||
1011 | memcpy(stats, &cmd->stats, sizeof(*stats)); | ||
1012 | up_read(&cmd->root_lock); | ||
1013 | } | ||
1014 | |||
1015 | void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, | ||
1016 | struct dm_cache_statistics *stats) | ||
1017 | { | ||
1018 | down_write(&cmd->root_lock); | ||
1019 | memcpy(&cmd->stats, stats, sizeof(*stats)); | ||
1020 | up_write(&cmd->root_lock); | ||
1021 | } | ||
1022 | |||
1023 | int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown) | ||
1024 | { | ||
1025 | int r; | ||
1026 | flags_mutator mutator = (clean_shutdown ? set_clean_shutdown : | ||
1027 | clear_clean_shutdown); | ||
1028 | |||
1029 | down_write(&cmd->root_lock); | ||
1030 | r = __commit_transaction(cmd, mutator); | ||
1031 | if (r) | ||
1032 | goto out; | ||
1033 | |||
1034 | r = __begin_transaction(cmd); | ||
1035 | |||
1036 | out: | ||
1037 | up_write(&cmd->root_lock); | ||
1038 | return r; | ||
1039 | } | ||
1040 | |||
1041 | int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, | ||
1042 | dm_block_t *result) | ||
1043 | { | ||
1044 | int r = -EINVAL; | ||
1045 | |||
1046 | down_read(&cmd->root_lock); | ||
1047 | r = dm_sm_get_nr_free(cmd->metadata_sm, result); | ||
1048 | up_read(&cmd->root_lock); | ||
1049 | |||
1050 | return r; | ||
1051 | } | ||
1052 | |||
1053 | int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, | ||
1054 | dm_block_t *result) | ||
1055 | { | ||
1056 | int r = -EINVAL; | ||
1057 | |||
1058 | down_read(&cmd->root_lock); | ||
1059 | r = dm_sm_get_nr_blocks(cmd->metadata_sm, result); | ||
1060 | up_read(&cmd->root_lock); | ||
1061 | |||
1062 | return r; | ||
1063 | } | ||
1064 | |||
1065 | /*----------------------------------------------------------------*/ | ||
1066 | |||
1067 | static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) | ||
1068 | { | ||
1069 | int r; | ||
1070 | __le32 value; | ||
1071 | size_t hint_size; | ||
1072 | const char *policy_name = dm_cache_policy_get_name(policy); | ||
1073 | |||
1074 | if (!policy_name[0] || | ||
1075 | (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) | ||
1076 | return -EINVAL; | ||
1077 | |||
1078 | if (strcmp(cmd->policy_name, policy_name)) { | ||
1079 | strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); | ||
1080 | |||
1081 | hint_size = dm_cache_policy_get_hint_size(policy); | ||
1082 | if (!hint_size) | ||
1083 | return 0; /* short-circuit hints initialization */ | ||
1084 | cmd->policy_hint_size = hint_size; | ||
1085 | |||
1086 | if (cmd->hint_root) { | ||
1087 | r = dm_array_del(&cmd->hint_info, cmd->hint_root); | ||
1088 | if (r) | ||
1089 | return r; | ||
1090 | } | ||
1091 | |||
1092 | r = dm_array_empty(&cmd->hint_info, &cmd->hint_root); | ||
1093 | if (r) | ||
1094 | return r; | ||
1095 | |||
1096 | value = cpu_to_le32(0); | ||
1097 | __dm_bless_for_disk(&value); | ||
1098 | r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0, | ||
1099 | from_cblock(cmd->cache_blocks), | ||
1100 | &value, &cmd->hint_root); | ||
1101 | if (r) | ||
1102 | return r; | ||
1103 | } | ||
1104 | |||
1105 | return 0; | ||
1106 | } | ||
1107 | |||
1108 | int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) | ||
1109 | { | ||
1110 | int r; | ||
1111 | |||
1112 | down_write(&cmd->root_lock); | ||
1113 | r = begin_hints(cmd, policy); | ||
1114 | up_write(&cmd->root_lock); | ||
1115 | |||
1116 | return r; | ||
1117 | } | ||
1118 | |||
1119 | static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, | ||
1120 | uint32_t hint) | ||
1121 | { | ||
1122 | int r; | ||
1123 | __le32 value = cpu_to_le32(hint); | ||
1124 | __dm_bless_for_disk(&value); | ||
1125 | |||
1126 | r = dm_array_set_value(&cmd->hint_info, cmd->hint_root, | ||
1127 | from_cblock(cblock), &value, &cmd->hint_root); | ||
1128 | cmd->changed = true; | ||
1129 | |||
1130 | return r; | ||
1131 | } | ||
1132 | |||
1133 | int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, | ||
1134 | uint32_t hint) | ||
1135 | { | ||
1136 | int r; | ||
1137 | |||
1138 | if (!hints_array_initialized(cmd)) | ||
1139 | return 0; | ||
1140 | |||
1141 | down_write(&cmd->root_lock); | ||
1142 | r = save_hint(cmd, cblock, hint); | ||
1143 | up_write(&cmd->root_lock); | ||
1144 | |||
1145 | return r; | ||
1146 | } | ||
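To make the mapping encoding used by this file's pack_value()/unpack_value() concrete, here is a short worked example (not part of the patch): with M_VALID = 1 and M_DIRTY = 2, origin block 5 marked valid and dirty packs to 0x50003.

static void mapping_encoding_example(void)
{
	dm_oblock_t ob;
	unsigned flags;
	__le64 v = pack_value(to_oblock(5), M_VALID | M_DIRTY);

	/* le64_to_cpu(v) == (5ULL << 16) | 3 == 0x50003 */
	unpack_value(v, &ob, &flags);
	/* from_oblock(ob) == 5, flags == (M_VALID | M_DIRTY) */
}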
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
new file mode 100644
index 000000000000..135864ea0eee
--- /dev/null
+++ b/drivers/md/dm-cache-metadata.h
@@ -0,0 +1,142 @@
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat, Inc. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_CACHE_METADATA_H | ||
8 | #define DM_CACHE_METADATA_H | ||
9 | |||
10 | #include "dm-cache-block-types.h" | ||
11 | #include "dm-cache-policy-internal.h" | ||
12 | |||
13 | /*----------------------------------------------------------------*/ | ||
14 | |||
15 | #define DM_CACHE_METADATA_BLOCK_SIZE 4096 | ||
16 | |||
17 | /* FIXME: remove this restriction */ | ||
18 | /* | ||
19 | * The metadata device is currently limited in size. | ||
20 | * | ||
21 | * We have one block of index, which can hold 255 index entries. Each | ||
22 | * index entry contains allocation info about 16k metadata blocks. | ||
23 | */ | ||
24 | #define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) | ||
25 | |||
26 | /* | ||
27 | * A metadata device larger than 16GB triggers a warning. | ||
28 | */ | ||
29 | #define DM_CACHE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) | ||
30 | |||
31 | /*----------------------------------------------------------------*/ | ||
32 | |||
33 | /* | ||
34 | * Ext[234]-style compat feature flags. | ||
35 | * | ||
36 | * A new feature which old metadata will still be compatible with should | ||
37 | * define a DM_CACHE_FEATURE_COMPAT_* flag (rarely useful). | ||
38 | * | ||
39 | * A new feature that is not compatible with old code should define a | ||
40 | * DM_CACHE_FEATURE_INCOMPAT_* flag and guard the relevant code with | ||
41 | * that flag. | ||
42 | * | ||
43 | * A new feature that is not compatible with old code accessing the | ||
44 | * metadata RDWR should define a DM_CACHE_FEATURE_RO_COMPAT_* flag and | ||
45 | * guard the relevant code with that flag. | ||
46 | * | ||
47 | * As these various flags are defined they should be added to the | ||
48 | * following masks. | ||
49 | */ | ||
50 | #define DM_CACHE_FEATURE_COMPAT_SUPP 0UL | ||
51 | #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL | ||
52 | #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL | ||
53 | |||
54 | /* | ||
55 | * Reopens or creates a new, empty metadata volume. | ||
56 | * Returns an ERR_PTR on failure. | ||
57 | */ | ||
58 | struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, | ||
59 | sector_t data_block_size, | ||
60 | bool may_format_device, | ||
61 | size_t policy_hint_size); | ||
62 | |||
63 | void dm_cache_metadata_close(struct dm_cache_metadata *cmd); | ||
64 | |||
65 | /* | ||
66 | * The metadata needs to know how many cache blocks there are. We don't | ||
67 | * care about the origin, assuming the core target is giving us valid | ||
68 | * origin blocks to map to. | ||
69 | */ | ||
70 | int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size); | ||
71 | dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd); | ||
72 | |||
73 | int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, | ||
74 | sector_t discard_block_size, | ||
75 | dm_dblock_t new_nr_entries); | ||
76 | |||
77 | typedef int (*load_discard_fn)(void *context, sector_t discard_block_size, | ||
78 | dm_dblock_t dblock, bool discarded); | ||
79 | int dm_cache_load_discards(struct dm_cache_metadata *cmd, | ||
80 | load_discard_fn fn, void *context); | ||
81 | |||
82 | int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard); | ||
83 | |||
84 | int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock); | ||
85 | int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock); | ||
86 | int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd); | ||
87 | |||
88 | typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock, | ||
89 | dm_cblock_t cblock, bool dirty, | ||
90 | uint32_t hint, bool hint_valid); | ||
91 | int dm_cache_load_mappings(struct dm_cache_metadata *cmd, | ||
92 | const char *policy_name, | ||
93 | load_mapping_fn fn, | ||
94 | void *context); | ||
95 | |||
96 | int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty); | ||
97 | |||
98 | struct dm_cache_statistics { | ||
99 | uint32_t read_hits; | ||
100 | uint32_t read_misses; | ||
101 | uint32_t write_hits; | ||
102 | uint32_t write_misses; | ||
103 | }; | ||
104 | |||
105 | void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, | ||
106 | struct dm_cache_statistics *stats); | ||
107 | void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, | ||
108 | struct dm_cache_statistics *stats); | ||
109 | |||
110 | int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown); | ||
111 | |||
112 | int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, | ||
113 | dm_block_t *result); | ||
114 | |||
115 | int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, | ||
116 | dm_block_t *result); | ||
117 | |||
118 | void dm_cache_dump(struct dm_cache_metadata *cmd); | ||
119 | |||
120 | /* | ||
121 | * The policy is invited to save a 32bit hint value for every cblock (eg, | ||
122 | * for a hit count). These are stored against the policy name. If | ||
123 | * policies are changed, then hints will be lost. If the machine crashes, | ||
124 | * hints will be lost. | ||
125 | * | ||
126 | * The hints are indexed by the cblock, but many policies will not | ||
127 | * neccessarily have a fast way of accessing efficiently via cblock. So | ||
128 | * rather than querying the policy for each cblock, we let it walk its data | ||
129 | * structures and fill in the hints in whatever order it wishes. | ||
130 | */ | ||
131 | |||
132 | int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p); | ||
133 | |||
134 | /* | ||
135 | * requests hints for every cblock and stores in the metadata device. | ||
136 | */ | ||
137 | int dm_cache_save_hint(struct dm_cache_metadata *cmd, | ||
138 | dm_cblock_t cblock, uint32_t hint); | ||
139 | |||
140 | /*----------------------------------------------------------------*/ | ||
141 | |||
142 | #endif /* DM_CACHE_METADATA_H */ | ||
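A rough usage sketch, not part of the patch, showing the expected call order for the metadata API declared above as the core target would drive it; the block size, hint size and block numbers are illustrative only.

static int metadata_usage_example(struct block_device *bdev)
{
	int r;
	struct dm_cache_metadata *cmd;

	/* 128 sectors per data block, 4-byte per-cblock policy hints */
	cmd = dm_cache_metadata_open(bdev, 128, true, 4);
	if (IS_ERR_OR_NULL(cmd))
		return cmd ? PTR_ERR(cmd) : -ENOMEM;

	r = dm_cache_resize(cmd, to_cblock(1024));
	if (!r)
		r = dm_cache_insert_mapping(cmd, to_cblock(0), to_oblock(17));
	if (!r)
		r = dm_cache_set_dirty(cmd, to_cblock(0), true);
	if (!r)
		r = dm_cache_commit(cmd, true);	/* clean shutdown */

	dm_cache_metadata_close(cmd);
	return r;
}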
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
new file mode 100644
index 000000000000..52a75beeced5
--- /dev/null
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -0,0 +1,124 @@
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_CACHE_POLICY_INTERNAL_H | ||
8 | #define DM_CACHE_POLICY_INTERNAL_H | ||
9 | |||
10 | #include "dm-cache-policy.h" | ||
11 | |||
12 | /*----------------------------------------------------------------*/ | ||
13 | |||
14 | /* | ||
15 | * Little inline functions that simplify calling the policy methods. | ||
16 | */ | ||
17 | static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock, | ||
18 | bool can_block, bool can_migrate, bool discarded_oblock, | ||
19 | struct bio *bio, struct policy_result *result) | ||
20 | { | ||
21 | return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result); | ||
22 | } | ||
23 | |||
24 | static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) | ||
25 | { | ||
26 | BUG_ON(!p->lookup); | ||
27 | return p->lookup(p, oblock, cblock); | ||
28 | } | ||
29 | |||
30 | static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) | ||
31 | { | ||
32 | if (p->set_dirty) | ||
33 | p->set_dirty(p, oblock); | ||
34 | } | ||
35 | |||
36 | static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) | ||
37 | { | ||
38 | if (p->clear_dirty) | ||
39 | p->clear_dirty(p, oblock); | ||
40 | } | ||
41 | |||
42 | static inline int policy_load_mapping(struct dm_cache_policy *p, | ||
43 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
44 | uint32_t hint, bool hint_valid) | ||
45 | { | ||
46 | return p->load_mapping(p, oblock, cblock, hint, hint_valid); | ||
47 | } | ||
48 | |||
49 | static inline int policy_walk_mappings(struct dm_cache_policy *p, | ||
50 | policy_walk_fn fn, void *context) | ||
51 | { | ||
52 | return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0; | ||
53 | } | ||
54 | |||
55 | static inline int policy_writeback_work(struct dm_cache_policy *p, | ||
56 | dm_oblock_t *oblock, | ||
57 | dm_cblock_t *cblock) | ||
58 | { | ||
59 | return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT; | ||
60 | } | ||
61 | |||
62 | static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) | ||
63 | { | ||
64 | return p->remove_mapping(p, oblock); | ||
65 | } | ||
66 | |||
67 | static inline void policy_force_mapping(struct dm_cache_policy *p, | ||
68 | dm_oblock_t current_oblock, dm_oblock_t new_oblock) | ||
69 | { | ||
70 | return p->force_mapping(p, current_oblock, new_oblock); | ||
71 | } | ||
72 | |||
73 | static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) | ||
74 | { | ||
75 | return p->residency(p); | ||
76 | } | ||
77 | |||
78 | static inline void policy_tick(struct dm_cache_policy *p) | ||
79 | { | ||
80 | if (p->tick) | ||
81 | return p->tick(p); | ||
82 | } | ||
83 | |||
84 | static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen) | ||
85 | { | ||
86 | ssize_t sz = 0; | ||
87 | if (p->emit_config_values) | ||
88 | return p->emit_config_values(p, result, maxlen); | ||
89 | |||
90 | DMEMIT("0"); | ||
91 | return 0; | ||
92 | } | ||
93 | |||
94 | static inline int policy_set_config_value(struct dm_cache_policy *p, | ||
95 | const char *key, const char *value) | ||
96 | { | ||
97 | return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; | ||
98 | } | ||
99 | |||
100 | /*----------------------------------------------------------------*/ | ||
101 | |||
102 | /* | ||
103 | * Creates a new cache policy given a policy name, a cache size, an origin size and the block size. | ||
104 | */ | ||
105 | struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size, | ||
106 | sector_t origin_size, sector_t block_size); | ||
107 | |||
108 | /* | ||
109 | * Destroys the policy. This drops references to the policy module as well | ||
110 | * as calling it's destroy method. So always use this rather than calling | ||
111 | * the policy->destroy method directly. | ||
112 | */ | ||
113 | void dm_cache_policy_destroy(struct dm_cache_policy *p); | ||
114 | |||
115 | /* | ||
116 | * In case we've forgotten. | ||
117 | */ | ||
118 | const char *dm_cache_policy_get_name(struct dm_cache_policy *p); | ||
119 | |||
120 | size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p); | ||
121 | |||
122 | /*----------------------------------------------------------------*/ | ||
123 | |||
124 | #endif /* DM_CACHE_POLICY_INTERNAL_H */ | ||
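For orientation, a hypothetical policy-module skeleton (not in this patch) showing how the registration interface in dm-cache-policy.c below is meant to be used. Field names follow the uses in that file (name, hint_size, owner, create); the full struct dm_cache_policy_type lives in dm-cache-policy.h, which is not shown here, so treat the initializer as an assumption.

static struct dm_cache_policy *example_create(dm_cblock_t cache_size,
					      sector_t origin_size,
					      sector_t cache_block_size)
{
	/* A real policy allocates a dm_cache_policy and fills in its ops. */
	return NULL;
}

static struct dm_cache_policy_type example_policy_type = {
	.name = "example",
	.hint_size = 4,		/* dm_cache_policy_register() accepts only 0 or 4 */
	.owner = THIS_MODULE,
	.create = example_create,
};

static int __init example_policy_init(void)
{
	return dm_cache_policy_register(&example_policy_type);
}

static void __exit example_policy_exit(void)
{
	dm_cache_policy_unregister(&example_policy_type);
}

module_init(example_policy_init);
module_exit(example_policy_exit);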
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
new file mode 100644
index 000000000000..2cbf5fdaac52
--- /dev/null
+++ b/drivers/md/dm-cache-policy.c
@@ -0,0 +1,161 @@
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm-cache-policy-internal.h" | ||
8 | #include "dm.h" | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/slab.h> | ||
12 | |||
13 | /*----------------------------------------------------------------*/ | ||
14 | |||
15 | #define DM_MSG_PREFIX "cache-policy" | ||
16 | |||
17 | static DEFINE_SPINLOCK(register_lock); | ||
18 | static LIST_HEAD(register_list); | ||
19 | |||
20 | static struct dm_cache_policy_type *__find_policy(const char *name) | ||
21 | { | ||
22 | struct dm_cache_policy_type *t; | ||
23 | |||
24 | list_for_each_entry(t, ®ister_list, list) | ||
25 | if (!strcmp(t->name, name)) | ||
26 | return t; | ||
27 | |||
28 | return NULL; | ||
29 | } | ||
30 | |||
31 | static struct dm_cache_policy_type *__get_policy_once(const char *name) | ||
32 | { | ||
33 | struct dm_cache_policy_type *t = __find_policy(name); | ||
34 | |||
35 | if (t && !try_module_get(t->owner)) { | ||
36 | DMWARN("couldn't get module %s", name); | ||
37 | t = ERR_PTR(-EINVAL); | ||
38 | } | ||
39 | |||
40 | return t; | ||
41 | } | ||
42 | |||
43 | static struct dm_cache_policy_type *get_policy_once(const char *name) | ||
44 | { | ||
45 | struct dm_cache_policy_type *t; | ||
46 | |||
47 | spin_lock(®ister_lock); | ||
48 | t = __get_policy_once(name); | ||
49 | spin_unlock(®ister_lock); | ||
50 | |||
51 | return t; | ||
52 | } | ||
53 | |||
54 | static struct dm_cache_policy_type *get_policy(const char *name) | ||
55 | { | ||
56 | struct dm_cache_policy_type *t; | ||
57 | |||
58 | t = get_policy_once(name); | ||
59 | if (IS_ERR(t)) | ||
60 | return NULL; | ||
61 | |||
62 | if (t) | ||
63 | return t; | ||
64 | |||
65 | request_module("dm-cache-%s", name); | ||
66 | |||
67 | t = get_policy_once(name); | ||
68 | if (IS_ERR(t)) | ||
69 | return NULL; | ||
70 | |||
71 | return t; | ||
72 | } | ||
73 | |||
74 | static void put_policy(struct dm_cache_policy_type *t) | ||
75 | { | ||
76 | module_put(t->owner); | ||
77 | } | ||
78 | |||
79 | int dm_cache_policy_register(struct dm_cache_policy_type *type) | ||
80 | { | ||
81 | int r; | ||
82 | |||
83 | /* One size fits all for now */ | ||
84 | if (type->hint_size != 0 && type->hint_size != 4) { | ||
85 | DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size); | ||
86 | return -EINVAL; | ||
87 | } | ||
88 | |||
89 | spin_lock(®ister_lock); | ||
90 | if (__find_policy(type->name)) { | ||
91 | DMWARN("attempt to register policy under duplicate name %s", type->name); | ||
92 | r = -EINVAL; | ||
93 | } else { | ||
94 | list_add(&type->list, ®ister_list); | ||
95 | r = 0; | ||
96 | } | ||
97 | spin_unlock(®ister_lock); | ||
98 | |||
99 | return r; | ||
100 | } | ||
101 | EXPORT_SYMBOL_GPL(dm_cache_policy_register); | ||
102 | |||
103 | void dm_cache_policy_unregister(struct dm_cache_policy_type *type) | ||
104 | { | ||
105 | spin_lock(®ister_lock); | ||
106 | list_del_init(&type->list); | ||
107 | spin_unlock(®ister_lock); | ||
108 | } | ||
109 | EXPORT_SYMBOL_GPL(dm_cache_policy_unregister); | ||
110 | |||
111 | struct dm_cache_policy *dm_cache_policy_create(const char *name, | ||
112 | dm_cblock_t cache_size, | ||
113 | sector_t origin_size, | ||
114 | sector_t cache_block_size) | ||
115 | { | ||
116 | struct dm_cache_policy *p = NULL; | ||
117 | struct dm_cache_policy_type *type; | ||
118 | |||
119 | type = get_policy(name); | ||
120 | if (!type) { | ||
121 | DMWARN("unknown policy type"); | ||
122 | return NULL; | ||
123 | } | ||
124 | |||
125 | p = type->create(cache_size, origin_size, cache_block_size); | ||
126 | if (!p) { | ||
127 | put_policy(type); | ||
128 | return NULL; | ||
129 | } | ||
130 | p->private = type; | ||
131 | |||
132 | return p; | ||
133 | } | ||
134 | EXPORT_SYMBOL_GPL(dm_cache_policy_create); | ||
135 | |||
136 | void dm_cache_policy_destroy(struct dm_cache_policy *p) | ||
137 | { | ||
138 | struct dm_cache_policy_type *t = p->private; | ||
139 | |||
140 | p->destroy(p); | ||
141 | put_policy(t); | ||
142 | } | ||
143 | EXPORT_SYMBOL_GPL(dm_cache_policy_destroy); | ||
144 | |||
145 | const char *dm_cache_policy_get_name(struct dm_cache_policy *p) | ||
146 | { | ||
147 | struct dm_cache_policy_type *t = p->private; | ||
148 | |||
149 | return t->name; | ||
150 | } | ||
151 | EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); | ||
152 | |||
153 | size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p) | ||
154 | { | ||
155 | struct dm_cache_policy_type *t = p->private; | ||
156 | |||
157 | return t->hint_size; | ||
158 | } | ||
159 | EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size); | ||
160 | |||
161 | /*----------------------------------------------------------------*/ | ||
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h new file mode 100644 index 000000000000..f0f51b260544 --- /dev/null +++ b/drivers/md/dm-cache-policy.h | |||
@@ -0,0 +1,228 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #ifndef DM_CACHE_POLICY_H | ||
8 | #define DM_CACHE_POLICY_H | ||
9 | |||
10 | #include "dm-cache-block-types.h" | ||
11 | |||
12 | #include <linux/device-mapper.h> | ||
13 | |||
14 | /*----------------------------------------------------------------*/ | ||
15 | |||
16 | /* FIXME: make it clear which methods are optional. Get debug policy to | ||
17 | * double check this at start. | ||
18 | */ | ||
19 | |||
20 | /* | ||
21 | * The cache policy makes the important decisions about which blocks get to | ||
22 | * live on the faster cache device. | ||
23 | * | ||
24 | * When the core target has to remap a bio it calls the 'map' method of the | ||
25 | * policy. This returns an instruction telling the core target what to do. | ||
26 | * | ||
27 | * POLICY_HIT: | ||
28 | * That block is in the cache. Remap to the cache and carry on. | ||
29 | * | ||
30 | * POLICY_MISS: | ||
31 | * This block is on the origin device. Remap and carry on. | ||
32 | * | ||
33 | * POLICY_NEW: | ||
34 | * This block is currently on the origin device, but the policy wants to | ||
35 | * move it. The core should: | ||
36 | * | ||
37 | * - hold any further io to this origin block | ||
38 | * - copy the origin to the given cache block | ||
39 | * - release all the held blocks | ||
40 | * - remap the original block to the cache | ||
41 | * | ||
42 | * POLICY_REPLACE: | ||
43 | * This block is currently on the origin device. The policy wants to | ||
44 | * move it to the cache, with the added complication that the destination | ||
45 | * cache block needs a writeback first. The core should: | ||
46 | * | ||
47 | * - hold any further io to this origin block | ||
48 | * - hold any further io to the origin block that's being written back | ||
49 | * - writeback | ||
50 | * - copy new block to cache | ||
51 | * - release held blocks | ||
52 | * - remap bio to cache and reissue. | ||
53 | * | ||
54 | * Should the core run into trouble while processing a POLICY_NEW or | ||
55 | * POLICY_REPLACE instruction it will roll back the policy's mapping using | ||
56 | * remove_mapping() or force_mapping(). These methods must not fail. This | ||
57 | * approach avoids having transactional semantics in the policy (ie, the | ||
58 | * core informing the policy when a migration is complete), and hence makes | ||
59 | * it easier to write new policies. | ||
60 | * | ||
61 | * In general policy methods should never block, except in the case of the | ||
62 | * map function when can_migrate is set. So be careful to implement using | ||
63 | * bounded, preallocated memory. | ||
64 | */ | ||
65 | enum policy_operation { | ||
66 | POLICY_HIT, | ||
67 | POLICY_MISS, | ||
68 | POLICY_NEW, | ||
69 | POLICY_REPLACE | ||
70 | }; | ||
71 | |||
72 | /* | ||
73 | * This is the instruction passed back to the core target. | ||
74 | */ | ||
75 | struct policy_result { | ||
76 | enum policy_operation op; | ||
77 | dm_oblock_t old_oblock; /* POLICY_REPLACE */ | ||
78 | dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ | ||
79 | }; | ||
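To make the instruction set above concrete, here is a hedged sketch of how a caller might act on the result of map(); remap_bio_to_cache(), remap_bio_to_origin(), start_promotion() and start_demote_then_promote() are invented placeholders, not functions from this patch:

/* Invented placeholder helpers so the sketch is self-contained. */
static void remap_bio_to_cache(struct bio *bio, dm_cblock_t cblock) { }
static void remap_bio_to_origin(struct bio *bio) { }
static void start_promotion(struct bio *bio, dm_cblock_t cblock) { }
static void start_demote_then_promote(struct bio *bio, dm_oblock_t old_oblock,
				      dm_cblock_t cblock) { }

static void example_handle_map_result(struct bio *bio, struct policy_result *r)
{
	switch (r->op) {
	case POLICY_HIT:
		remap_bio_to_cache(bio, r->cblock);	/* already cached */
		break;

	case POLICY_MISS:
		remap_bio_to_origin(bio);		/* leave it on the origin */
		break;

	case POLICY_NEW:
		/* hold io, copy the origin block to r->cblock, then remap */
		start_promotion(bio, r->cblock);
		break;

	case POLICY_REPLACE:
		/* write back the block currently occupying r->cblock first */
		start_demote_then_promote(bio, r->old_oblock, r->cblock);
		break;
	}
}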
80 | |||
81 | typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock, | ||
82 | dm_oblock_t oblock, uint32_t hint); | ||
83 | |||
84 | /* | ||
85 | * The cache policy object. Just a bunch of methods. It is envisaged that | ||
86 | * this structure will be embedded in a bigger, policy-specific structure | ||
87 | * (ie. use container_of(); a sketch of such an embedding follows the struct below). | ||
88 | */ | ||
89 | struct dm_cache_policy { | ||
90 | |||
91 | /* | ||
92 | * FIXME: make it clear which methods are optional, and which may | ||
93 | * block. | ||
94 | */ | ||
95 | |||
96 | /* | ||
97 | * Destroys this object. | ||
98 | */ | ||
99 | void (*destroy)(struct dm_cache_policy *p); | ||
100 | |||
101 | /* | ||
102 | * See large comment above. | ||
103 | * | ||
104 | * oblock - the origin block we're interested in. | ||
105 | * | ||
106 | * can_block - indicates whether the current thread is allowed to | ||
107 | * block. -EWOULDBLOCK is returned if it can't block but would need to. | ||
108 | * | ||
109 | * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE | ||
110 | * instructions. If denied and the policy would have | ||
111 | * returned one of these instructions it should | ||
112 | * return -EWOULDBLOCK. | ||
113 | * | ||
114 | * discarded_oblock - indicates whether the whole origin block is | ||
115 | * in a discarded state (FIXME: better to tell the | ||
116 | * policy about this sooner, so it can recycle that | ||
117 | * cache block if it wants.) | ||
118 | * bio - the bio that triggered this call. | ||
119 | * result - gets filled in with the instruction. | ||
120 | * | ||
121 | * May only return 0, or -EWOULDBLOCK (if !can_migrate) | ||
122 | */ | ||
123 | int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, | ||
124 | bool can_block, bool can_migrate, bool discarded_oblock, | ||
125 | struct bio *bio, struct policy_result *result); | ||
126 | |||
127 | /* | ||
128 | * Sometimes we want to see if a block is in the cache, without | ||
129 | * triggering any update of stats. (ie. it's not a real hit). | ||
130 | * | ||
131 | * Must not block. | ||
132 | * | ||
133 | * Returns 1 iff in cache, 0 iff not, < 0 on error (-EWOULDBLOCK | ||
134 | * would be typical). | ||
135 | */ | ||
136 | int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); | ||
137 | |||
138 | /* | ||
139 | * oblock must be a mapped block. Must not block. | ||
140 | */ | ||
141 | void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); | ||
142 | void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); | ||
143 | |||
144 | /* | ||
145 | * Called when a cache target is first created. Used to load a | ||
146 | * mapping from the metadata device into the policy. | ||
147 | */ | ||
148 | int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, | ||
149 | dm_cblock_t cblock, uint32_t hint, bool hint_valid); | ||
150 | |||
151 | int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn, | ||
152 | void *context); | ||
153 | |||
154 | /* | ||
155 | * Override functions used on the error paths of the core target. | ||
156 | * They must succeed. | ||
157 | */ | ||
158 | void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); | ||
159 | void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, | ||
160 | dm_oblock_t new_oblock); | ||
161 | |||
162 | int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); | ||
163 | |||
164 | |||
165 | /* | ||
166 | * How full is the cache? | ||
167 | */ | ||
168 | dm_cblock_t (*residency)(struct dm_cache_policy *p); | ||
169 | |||
170 | /* | ||
171 | * Because of where we sit in the block layer, we can be asked to | ||
172 | * map a lot of little bios that are all in the same block (no | ||
173 | * queue merging has occurred). To stop the policy from being fooled by | ||
174 | * these, the core target sends regular tick() calls to the policy. | ||
175 | * The policy should only count an entry as hit once per tick. | ||
176 | */ | ||
177 | void (*tick)(struct dm_cache_policy *p); | ||
178 | |||
179 | /* | ||
180 | * Configuration. | ||
181 | */ | ||
182 | int (*emit_config_values)(struct dm_cache_policy *p, | ||
183 | char *result, unsigned maxlen); | ||
184 | int (*set_config_value)(struct dm_cache_policy *p, | ||
185 | const char *key, const char *value); | ||
186 | |||
187 | /* | ||
188 | * Bookkeeping ptr for the policy register, not for general use. | ||
189 | */ | ||
190 | void *private; | ||
191 | }; | ||
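A hedged sketch of the embedding envisaged above; struct example_policy, to_example_policy() and example_map() are invented for illustration and are not part of this patch:

struct example_policy {
	struct dm_cache_policy policy;	/* embedded, recovered with container_of() */
	spinlock_t lock;
	/* ... policy-specific state: queues, hash tables, hit counts ... */
};

static struct example_policy *to_example_policy(struct dm_cache_policy *p)
{
	return container_of(p, struct example_policy, policy);
}

static int example_map(struct dm_cache_policy *p, dm_oblock_t oblock,
		       bool can_block, bool can_migrate, bool discarded_oblock,
		       struct bio *bio, struct policy_result *result)
{
	struct example_policy *ep = to_example_policy(p);

	/* consult ep's own state; a trivial policy just misses everything */
	result->op = POLICY_MISS;
	return 0;
}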
192 | |||
193 | /*----------------------------------------------------------------*/ | ||
194 | |||
195 | /* | ||
196 | * We maintain a little register of the different policy types. | ||
197 | */ | ||
198 | #define CACHE_POLICY_NAME_SIZE 16 | ||
199 | |||
200 | struct dm_cache_policy_type { | ||
201 | /* For use by the register code only. */ | ||
202 | struct list_head list; | ||
203 | |||
204 | /* | ||
205 | * Policy writers should fill in these fields. The name field is | ||
206 | * what gets passed on the target line to select your policy. | ||
207 | */ | ||
208 | char name[CACHE_POLICY_NAME_SIZE]; | ||
209 | |||
210 | /* | ||
211 | * Policies may store a hint for each cache block. | ||
212 | * Currently the size of this hint must be 0 or 4 bytes but we | ||
213 | * expect to relax this in future. | ||
214 | */ | ||
215 | size_t hint_size; | ||
216 | |||
217 | struct module *owner; | ||
218 | struct dm_cache_policy *(*create)(dm_cblock_t cache_size, | ||
219 | sector_t origin_size, | ||
220 | sector_t block_size); | ||
221 | }; | ||
222 | |||
223 | int dm_cache_policy_register(struct dm_cache_policy_type *type); | ||
224 | void dm_cache_policy_unregister(struct dm_cache_policy_type *type); | ||
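A hedged sketch of how a policy module might fill in and register this type; the "example" policy, example_create() and the module boilerplate are assumptions rather than code from this patch:

static struct dm_cache_policy *example_create(dm_cblock_t cache_size,
					      sector_t origin_size,
					      sector_t block_size)
{
	/* allocate a policy-specific struct, fill in its method table, return &ep->policy */
	return NULL;	/* sketch only */
}

static struct dm_cache_policy_type example_policy_type = {
	.name = "example",	/* what the target line selects */
	.hint_size = 4,		/* currently must be 0 or 4 */
	.owner = THIS_MODULE,
	.create = example_create,
};

static int __init example_policy_init(void)
{
	/* build the module as dm-cache-example so request_module() can autoload it */
	return dm_cache_policy_register(&example_policy_type);
}

static void __exit example_policy_exit(void)
{
	dm_cache_policy_unregister(&example_policy_type);
}

module_init(example_policy_init);
module_exit(example_policy_exit);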
225 | |||
226 | /*----------------------------------------------------------------*/ | ||
227 | |||
228 | #endif /* DM_CACHE_POLICY_H */ | ||
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c new file mode 100644 index 000000000000..0f4e84b15c30 --- /dev/null +++ b/drivers/md/dm-cache-target.c | |||
@@ -0,0 +1,2584 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat. All rights reserved. | ||
3 | * | ||
4 | * This file is released under the GPL. | ||
5 | */ | ||
6 | |||
7 | #include "dm.h" | ||
8 | #include "dm-bio-prison.h" | ||
9 | #include "dm-cache-metadata.h" | ||
10 | |||
11 | #include <linux/dm-io.h> | ||
12 | #include <linux/dm-kcopyd.h> | ||
13 | #include <linux/init.h> | ||
14 | #include <linux/mempool.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/vmalloc.h> | ||
18 | |||
19 | #define DM_MSG_PREFIX "cache" | ||
20 | |||
21 | DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, | ||
22 | "A percentage of time allocated for copying to and/or from cache"); | ||
23 | |||
24 | /*----------------------------------------------------------------*/ | ||
25 | |||
26 | /* | ||
27 | * Glossary: | ||
28 | * | ||
29 | * oblock: index of an origin block | ||
30 | * cblock: index of a cache block | ||
31 | * promotion: movement of a block from origin to cache | ||
32 | * demotion: movement of a block from cache to origin | ||
33 | * migration: movement of a block between the origin and cache device, | ||
34 | * either direction | ||
35 | */ | ||
36 | |||
37 | /*----------------------------------------------------------------*/ | ||
38 | |||
39 | static size_t bitset_size_in_bytes(unsigned nr_entries) | ||
40 | { | ||
41 | return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); | ||
42 | } | ||
43 | |||
44 | static unsigned long *alloc_bitset(unsigned nr_entries) | ||
45 | { | ||
46 | size_t s = bitset_size_in_bytes(nr_entries); | ||
47 | return vzalloc(s); | ||
48 | } | ||
49 | |||
50 | static void clear_bitset(void *bitset, unsigned nr_entries) | ||
51 | { | ||
52 | size_t s = bitset_size_in_bytes(nr_entries); | ||
53 | memset(bitset, 0, s); | ||
54 | } | ||
55 | |||
56 | static void free_bitset(unsigned long *bits) | ||
57 | { | ||
58 | vfree(bits); | ||
59 | } | ||
60 | |||
61 | /*----------------------------------------------------------------*/ | ||
62 | |||
63 | #define PRISON_CELLS 1024 | ||
64 | #define MIGRATION_POOL_SIZE 128 | ||
65 | #define COMMIT_PERIOD HZ | ||
66 | #define MIGRATION_COUNT_WINDOW 10 | ||
67 | |||
68 | /* | ||
69 | * The block size of the device holding cache data must be >= 32KB | ||
70 | */ | ||
71 | #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) | ||
72 | |||
73 | /* | ||
74 | * FIXME: the cache is read/write for the time being. | ||
75 | */ | ||
76 | enum cache_mode { | ||
77 | CM_WRITE, /* metadata may be changed */ | ||
78 | CM_READ_ONLY, /* metadata may not be changed */ | ||
79 | }; | ||
80 | |||
81 | struct cache_features { | ||
82 | enum cache_mode mode; | ||
83 | bool write_through:1; | ||
84 | }; | ||
85 | |||
86 | struct cache_stats { | ||
87 | atomic_t read_hit; | ||
88 | atomic_t read_miss; | ||
89 | atomic_t write_hit; | ||
90 | atomic_t write_miss; | ||
91 | atomic_t demotion; | ||
92 | atomic_t promotion; | ||
93 | atomic_t copies_avoided; | ||
94 | atomic_t cache_cell_clash; | ||
95 | atomic_t commit_count; | ||
96 | atomic_t discard_count; | ||
97 | }; | ||
98 | |||
99 | struct cache { | ||
100 | struct dm_target *ti; | ||
101 | struct dm_target_callbacks callbacks; | ||
102 | |||
103 | /* | ||
104 | * Metadata is written to this device. | ||
105 | */ | ||
106 | struct dm_dev *metadata_dev; | ||
107 | |||
108 | /* | ||
109 | * The slower of the two data devices. Typically a spindle. | ||
110 | */ | ||
111 | struct dm_dev *origin_dev; | ||
112 | |||
113 | /* | ||
114 | * The faster of the two data devices. Typically an SSD. | ||
115 | */ | ||
116 | struct dm_dev *cache_dev; | ||
117 | |||
118 | /* | ||
119 | * Cache features such as write-through. | ||
120 | */ | ||
121 | struct cache_features features; | ||
122 | |||
123 | /* | ||
124 | * Size of the origin device in _complete_ blocks and native sectors. | ||
125 | */ | ||
126 | dm_oblock_t origin_blocks; | ||
127 | sector_t origin_sectors; | ||
128 | |||
129 | /* | ||
130 | * Size of the cache device in blocks. | ||
131 | */ | ||
132 | dm_cblock_t cache_size; | ||
133 | |||
134 | /* | ||
135 | * Fields for converting from sectors to blocks. | ||
136 | */ | ||
137 | uint32_t sectors_per_block; | ||
138 | int sectors_per_block_shift; | ||
139 | |||
140 | struct dm_cache_metadata *cmd; | ||
141 | |||
142 | spinlock_t lock; | ||
143 | struct bio_list deferred_bios; | ||
144 | struct bio_list deferred_flush_bios; | ||
145 | struct list_head quiesced_migrations; | ||
146 | struct list_head completed_migrations; | ||
147 | struct list_head need_commit_migrations; | ||
148 | sector_t migration_threshold; | ||
149 | atomic_t nr_migrations; | ||
150 | wait_queue_head_t migration_wait; | ||
151 | |||
152 | /* | ||
153 | * cache_size entries, dirty if set | ||
154 | */ | ||
155 | dm_cblock_t nr_dirty; | ||
156 | unsigned long *dirty_bitset; | ||
157 | |||
158 | /* | ||
159 | * origin_blocks entries, discarded if set. | ||
160 | */ | ||
161 | sector_t discard_block_size; /* a power of 2 times sectors per block */ | ||
162 | dm_dblock_t discard_nr_blocks; | ||
163 | unsigned long *discard_bitset; | ||
164 | |||
165 | struct dm_kcopyd_client *copier; | ||
166 | struct workqueue_struct *wq; | ||
167 | struct work_struct worker; | ||
168 | |||
169 | struct delayed_work waker; | ||
170 | unsigned long last_commit_jiffies; | ||
171 | |||
172 | struct dm_bio_prison *prison; | ||
173 | struct dm_deferred_set *all_io_ds; | ||
174 | |||
175 | mempool_t *migration_pool; | ||
176 | struct dm_cache_migration *next_migration; | ||
177 | |||
178 | struct dm_cache_policy *policy; | ||
179 | unsigned policy_nr_args; | ||
180 | |||
181 | bool need_tick_bio:1; | ||
182 | bool sized:1; | ||
183 | bool quiescing:1; | ||
184 | bool commit_requested:1; | ||
185 | bool loaded_mappings:1; | ||
186 | bool loaded_discards:1; | ||
187 | |||
188 | struct cache_stats stats; | ||
189 | |||
190 | /* | ||
191 | * Rather than reconstructing the table line for the status we just | ||
192 | * save it and regurgitate. | ||
193 | */ | ||
194 | unsigned nr_ctr_args; | ||
195 | const char **ctr_args; | ||
196 | }; | ||
197 | |||
198 | struct per_bio_data { | ||
199 | bool tick:1; | ||
200 | unsigned req_nr:2; | ||
201 | struct dm_deferred_entry *all_io_entry; | ||
202 | }; | ||
203 | |||
204 | struct dm_cache_migration { | ||
205 | struct list_head list; | ||
206 | struct cache *cache; | ||
207 | |||
208 | unsigned long start_jiffies; | ||
209 | dm_oblock_t old_oblock; | ||
210 | dm_oblock_t new_oblock; | ||
211 | dm_cblock_t cblock; | ||
212 | |||
213 | bool err:1; | ||
214 | bool writeback:1; | ||
215 | bool demote:1; | ||
216 | bool promote:1; | ||
217 | |||
218 | struct dm_bio_prison_cell *old_ocell; | ||
219 | struct dm_bio_prison_cell *new_ocell; | ||
220 | }; | ||
221 | |||
222 | /* | ||
223 | * Processing a bio in the worker thread may require these memory | ||
224 | * allocations. We prealloc to avoid deadlocks (the same worker thread | ||
225 | * frees them back to the mempool). | ||
226 | */ | ||
227 | struct prealloc { | ||
228 | struct dm_cache_migration *mg; | ||
229 | struct dm_bio_prison_cell *cell1; | ||
230 | struct dm_bio_prison_cell *cell2; | ||
231 | }; | ||
232 | |||
233 | static void wake_worker(struct cache *cache) | ||
234 | { | ||
235 | queue_work(cache->wq, &cache->worker); | ||
236 | } | ||
237 | |||
238 | /*----------------------------------------------------------------*/ | ||
239 | |||
240 | static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) | ||
241 | { | ||
242 | /* FIXME: change to use a local slab. */ | ||
243 | return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); | ||
244 | } | ||
245 | |||
246 | static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) | ||
247 | { | ||
248 | dm_bio_prison_free_cell(cache->prison, cell); | ||
249 | } | ||
250 | |||
251 | static int prealloc_data_structs(struct cache *cache, struct prealloc *p) | ||
252 | { | ||
253 | if (!p->mg) { | ||
254 | p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); | ||
255 | if (!p->mg) | ||
256 | return -ENOMEM; | ||
257 | } | ||
258 | |||
259 | if (!p->cell1) { | ||
260 | p->cell1 = alloc_prison_cell(cache); | ||
261 | if (!p->cell1) | ||
262 | return -ENOMEM; | ||
263 | } | ||
264 | |||
265 | if (!p->cell2) { | ||
266 | p->cell2 = alloc_prison_cell(cache); | ||
267 | if (!p->cell2) | ||
268 | return -ENOMEM; | ||
269 | } | ||
270 | |||
271 | return 0; | ||
272 | } | ||
273 | |||
274 | static void prealloc_free_structs(struct cache *cache, struct prealloc *p) | ||
275 | { | ||
276 | if (p->cell2) | ||
277 | free_prison_cell(cache, p->cell2); | ||
278 | |||
279 | if (p->cell1) | ||
280 | free_prison_cell(cache, p->cell1); | ||
281 | |||
282 | if (p->mg) | ||
283 | mempool_free(p->mg, cache->migration_pool); | ||
284 | } | ||
285 | |||
286 | static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) | ||
287 | { | ||
288 | struct dm_cache_migration *mg = p->mg; | ||
289 | |||
290 | BUG_ON(!mg); | ||
291 | p->mg = NULL; | ||
292 | |||
293 | return mg; | ||
294 | } | ||
295 | |||
296 | /* | ||
297 | * You must have a cell within the prealloc struct to return. If not, this | ||
298 | * function will BUG() rather than returning NULL. | ||
299 | */ | ||
300 | static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) | ||
301 | { | ||
302 | struct dm_bio_prison_cell *r = NULL; | ||
303 | |||
304 | if (p->cell1) { | ||
305 | r = p->cell1; | ||
306 | p->cell1 = NULL; | ||
307 | |||
308 | } else if (p->cell2) { | ||
309 | r = p->cell2; | ||
310 | p->cell2 = NULL; | ||
311 | } else | ||
312 | BUG(); | ||
313 | |||
314 | return r; | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * You can't have more than two cells in a prealloc struct. BUG() will be | ||
319 | * called if you try and overfill. | ||
320 | */ | ||
321 | static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) | ||
322 | { | ||
323 | if (!p->cell2) | ||
324 | p->cell2 = cell; | ||
325 | |||
326 | else if (!p->cell1) | ||
327 | p->cell1 = cell; | ||
328 | |||
329 | else | ||
330 | BUG(); | ||
331 | } | ||
332 | |||
333 | /*----------------------------------------------------------------*/ | ||
334 | |||
335 | static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) | ||
336 | { | ||
337 | key->virtual = 0; | ||
338 | key->dev = 0; | ||
339 | key->block = from_oblock(oblock); | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * The caller hands in a preallocated cell, and a free function for it. | ||
344 | * The cell will be freed if there's an error, or if it wasn't used because | ||
345 | * a cell with that key already exists. | ||
346 | */ | ||
347 | typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); | ||
348 | |||
349 | static int bio_detain(struct cache *cache, dm_oblock_t oblock, | ||
350 | struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, | ||
351 | cell_free_fn free_fn, void *free_context, | ||
352 | struct dm_bio_prison_cell **cell_result) | ||
353 | { | ||
354 | int r; | ||
355 | struct dm_cell_key key; | ||
356 | |||
357 | build_key(oblock, &key); | ||
358 | r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); | ||
359 | if (r) | ||
360 | free_fn(free_context, cell_prealloc); | ||
361 | |||
362 | return r; | ||
363 | } | ||
364 | |||
365 | static int get_cell(struct cache *cache, | ||
366 | dm_oblock_t oblock, | ||
367 | struct prealloc *structs, | ||
368 | struct dm_bio_prison_cell **cell_result) | ||
369 | { | ||
370 | int r; | ||
371 | struct dm_cell_key key; | ||
372 | struct dm_bio_prison_cell *cell_prealloc; | ||
373 | |||
374 | cell_prealloc = prealloc_get_cell(structs); | ||
375 | |||
376 | build_key(oblock, &key); | ||
377 | r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); | ||
378 | if (r) | ||
379 | prealloc_put_cell(structs, cell_prealloc); | ||
380 | |||
381 | return r; | ||
382 | } | ||
383 | |||
384 | /*----------------------------------------------------------------*/ | ||
385 | |||
386 | static bool is_dirty(struct cache *cache, dm_cblock_t b) | ||
387 | { | ||
388 | return test_bit(from_cblock(b), cache->dirty_bitset); | ||
389 | } | ||
390 | |||
391 | static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) | ||
392 | { | ||
393 | if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { | ||
394 | cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1); | ||
395 | policy_set_dirty(cache->policy, oblock); | ||
396 | } | ||
397 | } | ||
398 | |||
399 | static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) | ||
400 | { | ||
401 | if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { | ||
402 | policy_clear_dirty(cache->policy, oblock); | ||
403 | cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1); | ||
404 | if (!from_cblock(cache->nr_dirty)) | ||
405 | dm_table_event(cache->ti->table); | ||
406 | } | ||
407 | } | ||
408 | |||
409 | /*----------------------------------------------------------------*/ | ||
410 | static bool block_size_is_power_of_two(struct cache *cache) | ||
411 | { | ||
412 | return cache->sectors_per_block_shift >= 0; | ||
413 | } | ||
414 | |||
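/*
 * discard_block_size is a multiple of the cache block size, so first work
 * out how many cache blocks make up one discard block, then divide the
 * origin block number by that to get the discard block number.
 */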
415 | static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) | ||
416 | { | ||
417 | sector_t discard_blocks = cache->discard_block_size; | ||
418 | dm_block_t b = from_oblock(oblock); | ||
419 | |||
420 | if (!block_size_is_power_of_two(cache)) | ||
421 | (void) sector_div(discard_blocks, cache->sectors_per_block); | ||
422 | else | ||
423 | discard_blocks >>= cache->sectors_per_block_shift; | ||
424 | |||
425 | (void) sector_div(b, discard_blocks); | ||
426 | |||
427 | return to_dblock(b); | ||
428 | } | ||
429 | |||
430 | static void set_discard(struct cache *cache, dm_dblock_t b) | ||
431 | { | ||
432 | unsigned long flags; | ||
433 | |||
434 | atomic_inc(&cache->stats.discard_count); | ||
435 | |||
436 | spin_lock_irqsave(&cache->lock, flags); | ||
437 | set_bit(from_dblock(b), cache->discard_bitset); | ||
438 | spin_unlock_irqrestore(&cache->lock, flags); | ||
439 | } | ||
440 | |||
441 | static void clear_discard(struct cache *cache, dm_dblock_t b) | ||
442 | { | ||
443 | unsigned long flags; | ||
444 | |||
445 | spin_lock_irqsave(&cache->lock, flags); | ||
446 | clear_bit(from_dblock(b), cache->discard_bitset); | ||
447 | spin_unlock_irqrestore(&cache->lock, flags); | ||
448 | } | ||
449 | |||
450 | static bool is_discarded(struct cache *cache, dm_dblock_t b) | ||
451 | { | ||
452 | int r; | ||
453 | unsigned long flags; | ||
454 | |||
455 | spin_lock_irqsave(&cache->lock, flags); | ||
456 | r = test_bit(from_dblock(b), cache->discard_bitset); | ||
457 | spin_unlock_irqrestore(&cache->lock, flags); | ||
458 | |||
459 | return r; | ||
460 | } | ||
461 | |||
462 | static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) | ||
463 | { | ||
464 | int r; | ||
465 | unsigned long flags; | ||
466 | |||
467 | spin_lock_irqsave(&cache->lock, flags); | ||
468 | r = test_bit(from_dblock(oblock_to_dblock(cache, b)), | ||
469 | cache->discard_bitset); | ||
470 | spin_unlock_irqrestore(&cache->lock, flags); | ||
471 | |||
472 | return r; | ||
473 | } | ||
474 | |||
475 | /*----------------------------------------------------------------*/ | ||
476 | |||
477 | static void load_stats(struct cache *cache) | ||
478 | { | ||
479 | struct dm_cache_statistics stats; | ||
480 | |||
481 | dm_cache_metadata_get_stats(cache->cmd, &stats); | ||
482 | atomic_set(&cache->stats.read_hit, stats.read_hits); | ||
483 | atomic_set(&cache->stats.read_miss, stats.read_misses); | ||
484 | atomic_set(&cache->stats.write_hit, stats.write_hits); | ||
485 | atomic_set(&cache->stats.write_miss, stats.write_misses); | ||
486 | } | ||
487 | |||
488 | static void save_stats(struct cache *cache) | ||
489 | { | ||
490 | struct dm_cache_statistics stats; | ||
491 | |||
492 | stats.read_hits = atomic_read(&cache->stats.read_hit); | ||
493 | stats.read_misses = atomic_read(&cache->stats.read_miss); | ||
494 | stats.write_hits = atomic_read(&cache->stats.write_hit); | ||
495 | stats.write_misses = atomic_read(&cache->stats.write_miss); | ||
496 | |||
497 | dm_cache_metadata_set_stats(cache->cmd, &stats); | ||
498 | } | ||
499 | |||
500 | /*---------------------------------------------------------------- | ||
501 | * Per bio data | ||
502 | *--------------------------------------------------------------*/ | ||
503 | static struct per_bio_data *get_per_bio_data(struct bio *bio) | ||
504 | { | ||
505 | struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); | ||
506 | BUG_ON(!pb); | ||
507 | return pb; | ||
508 | } | ||
509 | |||
510 | static struct per_bio_data *init_per_bio_data(struct bio *bio) | ||
511 | { | ||
512 | struct per_bio_data *pb = get_per_bio_data(bio); | ||
513 | |||
514 | pb->tick = false; | ||
515 | pb->req_nr = dm_bio_get_target_bio_nr(bio); | ||
516 | pb->all_io_entry = NULL; | ||
517 | |||
518 | return pb; | ||
519 | } | ||
520 | |||
521 | /*---------------------------------------------------------------- | ||
522 | * Remapping | ||
523 | *--------------------------------------------------------------*/ | ||
524 | static void remap_to_origin(struct cache *cache, struct bio *bio) | ||
525 | { | ||
526 | bio->bi_bdev = cache->origin_dev->bdev; | ||
527 | } | ||
528 | |||
529 | static void remap_to_cache(struct cache *cache, struct bio *bio, | ||
530 | dm_cblock_t cblock) | ||
531 | { | ||
532 | sector_t bi_sector = bio->bi_sector; | ||
533 | |||
534 | bio->bi_bdev = cache->cache_dev->bdev; | ||
535 | if (!block_size_is_power_of_two(cache)) | ||
536 | bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) + | ||
537 | sector_div(bi_sector, cache->sectors_per_block); | ||
538 | else | ||
539 | bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) | | ||
540 | (bi_sector & (cache->sectors_per_block - 1)); | ||
541 | } | ||
542 | |||
543 | static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) | ||
544 | { | ||
545 | unsigned long flags; | ||
546 | struct per_bio_data *pb = get_per_bio_data(bio); | ||
547 | |||
548 | spin_lock_irqsave(&cache->lock, flags); | ||
549 | if (cache->need_tick_bio && | ||
550 | !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) { | ||
551 | pb->tick = true; | ||
552 | cache->need_tick_bio = false; | ||
553 | } | ||
554 | spin_unlock_irqrestore(&cache->lock, flags); | ||
555 | } | ||
556 | |||
557 | static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, | ||
558 | dm_oblock_t oblock) | ||
559 | { | ||
560 | check_if_tick_bio_needed(cache, bio); | ||
561 | remap_to_origin(cache, bio); | ||
562 | if (bio_data_dir(bio) == WRITE) | ||
563 | clear_discard(cache, oblock_to_dblock(cache, oblock)); | ||
564 | } | ||
565 | |||
566 | static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, | ||
567 | dm_oblock_t oblock, dm_cblock_t cblock) | ||
568 | { | ||
569 | remap_to_cache(cache, bio, cblock); | ||
570 | if (bio_data_dir(bio) == WRITE) { | ||
571 | set_dirty(cache, oblock, cblock); | ||
572 | clear_discard(cache, oblock_to_dblock(cache, oblock)); | ||
573 | } | ||
574 | } | ||
575 | |||
576 | static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) | ||
577 | { | ||
578 | sector_t block_nr = bio->bi_sector; | ||
579 | |||
580 | if (!block_size_is_power_of_two(cache)) | ||
581 | (void) sector_div(block_nr, cache->sectors_per_block); | ||
582 | else | ||
583 | block_nr >>= cache->sectors_per_block_shift; | ||
584 | |||
585 | return to_oblock(block_nr); | ||
586 | } | ||
587 | |||
588 | static int bio_triggers_commit(struct cache *cache, struct bio *bio) | ||
589 | { | ||
590 | return bio->bi_rw & (REQ_FLUSH | REQ_FUA); | ||
591 | } | ||
592 | |||
593 | static void issue(struct cache *cache, struct bio *bio) | ||
594 | { | ||
595 | unsigned long flags; | ||
596 | |||
597 | if (!bio_triggers_commit(cache, bio)) { | ||
598 | generic_make_request(bio); | ||
599 | return; | ||
600 | } | ||
601 | |||
602 | /* | ||
603 | * Batch together any bios that trigger commits and then issue a | ||
604 | * single commit for them in do_worker(). | ||
605 | */ | ||
606 | spin_lock_irqsave(&cache->lock, flags); | ||
607 | cache->commit_requested = true; | ||
608 | bio_list_add(&cache->deferred_flush_bios, bio); | ||
609 | spin_unlock_irqrestore(&cache->lock, flags); | ||
610 | } | ||
611 | |||
612 | /*---------------------------------------------------------------- | ||
613 | * Migration processing | ||
614 | * | ||
615 | * Migration covers moving data from the origin device to the cache, or | ||
616 | * vice versa. | ||
617 | *--------------------------------------------------------------*/ | ||
618 | static void free_migration(struct dm_cache_migration *mg) | ||
619 | { | ||
620 | mempool_free(mg, mg->cache->migration_pool); | ||
621 | } | ||
622 | |||
623 | static void inc_nr_migrations(struct cache *cache) | ||
624 | { | ||
625 | atomic_inc(&cache->nr_migrations); | ||
626 | } | ||
627 | |||
628 | static void dec_nr_migrations(struct cache *cache) | ||
629 | { | ||
630 | atomic_dec(&cache->nr_migrations); | ||
631 | |||
632 | /* | ||
633 | * Wake the worker in case we're suspending the target. | ||
634 | */ | ||
635 | wake_up(&cache->migration_wait); | ||
636 | } | ||
637 | |||
638 | static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, | ||
639 | bool holder) | ||
640 | { | ||
641 | (holder ? dm_cell_release : dm_cell_release_no_holder) | ||
642 | (cache->prison, cell, &cache->deferred_bios); | ||
643 | free_prison_cell(cache, cell); | ||
644 | } | ||
645 | |||
646 | static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, | ||
647 | bool holder) | ||
648 | { | ||
649 | unsigned long flags; | ||
650 | |||
651 | spin_lock_irqsave(&cache->lock, flags); | ||
652 | __cell_defer(cache, cell, holder); | ||
653 | spin_unlock_irqrestore(&cache->lock, flags); | ||
654 | |||
655 | wake_worker(cache); | ||
656 | } | ||
657 | |||
658 | static void cleanup_migration(struct dm_cache_migration *mg) | ||
659 | { | ||
660 | dec_nr_migrations(mg->cache); | ||
661 | free_migration(mg); | ||
662 | } | ||
663 | |||
664 | static void migration_failure(struct dm_cache_migration *mg) | ||
665 | { | ||
666 | struct cache *cache = mg->cache; | ||
667 | |||
668 | if (mg->writeback) { | ||
669 | DMWARN_LIMIT("writeback failed; couldn't copy block"); | ||
670 | set_dirty(cache, mg->old_oblock, mg->cblock); | ||
671 | cell_defer(cache, mg->old_ocell, false); | ||
672 | |||
673 | } else if (mg->demote) { | ||
674 | DMWARN_LIMIT("demotion failed; couldn't copy block"); | ||
675 | policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); | ||
676 | |||
677 | cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); | ||
678 | if (mg->promote) | ||
679 | cell_defer(cache, mg->new_ocell, 1); | ||
680 | } else { | ||
681 | DMWARN_LIMIT("promotion failed; couldn't copy block"); | ||
682 | policy_remove_mapping(cache->policy, mg->new_oblock); | ||
683 | cell_defer(cache, mg->new_ocell, 1); | ||
684 | } | ||
685 | |||
686 | cleanup_migration(mg); | ||
687 | } | ||
688 | |||
689 | static void migration_success_pre_commit(struct dm_cache_migration *mg) | ||
690 | { | ||
691 | unsigned long flags; | ||
692 | struct cache *cache = mg->cache; | ||
693 | |||
694 | if (mg->writeback) { | ||
695 | cell_defer(cache, mg->old_ocell, false); | ||
696 | clear_dirty(cache, mg->old_oblock, mg->cblock); | ||
697 | cleanup_migration(mg); | ||
698 | return; | ||
699 | |||
700 | } else if (mg->demote) { | ||
701 | if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) { | ||
702 | DMWARN_LIMIT("demotion failed; couldn't update on disk metadata"); | ||
703 | policy_force_mapping(cache->policy, mg->new_oblock, | ||
704 | mg->old_oblock); | ||
705 | if (mg->promote) | ||
706 | cell_defer(cache, mg->new_ocell, true); | ||
707 | cleanup_migration(mg); | ||
708 | return; | ||
709 | } | ||
710 | } else { | ||
711 | if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) { | ||
712 | DMWARN_LIMIT("promotion failed; couldn't update on disk metadata"); | ||
713 | policy_remove_mapping(cache->policy, mg->new_oblock); | ||
714 | cleanup_migration(mg); | ||
715 | return; | ||
716 | } | ||
717 | } | ||
718 | |||
719 | spin_lock_irqsave(&cache->lock, flags); | ||
720 | list_add_tail(&mg->list, &cache->need_commit_migrations); | ||
721 | cache->commit_requested = true; | ||
722 | spin_unlock_irqrestore(&cache->lock, flags); | ||
723 | } | ||
724 | |||
725 | static void migration_success_post_commit(struct dm_cache_migration *mg) | ||
726 | { | ||
727 | unsigned long flags; | ||
728 | struct cache *cache = mg->cache; | ||
729 | |||
730 | if (mg->writeback) { | ||
731 | DMWARN("writeback unexpectedly triggered commit"); | ||
732 | return; | ||
733 | |||
734 | } else if (mg->demote) { | ||
735 | cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); | ||
736 | |||
737 | if (mg->promote) { | ||
738 | mg->demote = false; | ||
739 | |||
740 | spin_lock_irqsave(&cache->lock, flags); | ||
741 | list_add_tail(&mg->list, &cache->quiesced_migrations); | ||
742 | spin_unlock_irqrestore(&cache->lock, flags); | ||
743 | |||
744 | } else | ||
745 | cleanup_migration(mg); | ||
746 | |||
747 | } else { | ||
748 | cell_defer(cache, mg->new_ocell, true); | ||
749 | clear_dirty(cache, mg->new_oblock, mg->cblock); | ||
750 | cleanup_migration(mg); | ||
751 | } | ||
752 | } | ||
753 | |||
754 | static void copy_complete(int read_err, unsigned long write_err, void *context) | ||
755 | { | ||
756 | unsigned long flags; | ||
757 | struct dm_cache_migration *mg = (struct dm_cache_migration *) context; | ||
758 | struct cache *cache = mg->cache; | ||
759 | |||
760 | if (read_err || write_err) | ||
761 | mg->err = true; | ||
762 | |||
763 | spin_lock_irqsave(&cache->lock, flags); | ||
764 | list_add_tail(&mg->list, &cache->completed_migrations); | ||
765 | spin_unlock_irqrestore(&cache->lock, flags); | ||
766 | |||
767 | wake_worker(cache); | ||
768 | } | ||
769 | |||
770 | static void issue_copy_real(struct dm_cache_migration *mg) | ||
771 | { | ||
772 | int r; | ||
773 | struct dm_io_region o_region, c_region; | ||
774 | struct cache *cache = mg->cache; | ||
775 | |||
776 | o_region.bdev = cache->origin_dev->bdev; | ||
777 | o_region.count = cache->sectors_per_block; | ||
778 | |||
779 | c_region.bdev = cache->cache_dev->bdev; | ||
780 | c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block; | ||
781 | c_region.count = cache->sectors_per_block; | ||
782 | |||
783 | if (mg->writeback || mg->demote) { | ||
784 | /* demote */ | ||
785 | o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; | ||
786 | r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); | ||
787 | } else { | ||
788 | /* promote */ | ||
789 | o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; | ||
790 | r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); | ||
791 | } | ||
792 | |||
793 | if (r < 0) | ||
794 | migration_failure(mg); | ||
795 | } | ||
796 | |||
797 | static void avoid_copy(struct dm_cache_migration *mg) | ||
798 | { | ||
799 | atomic_inc(&mg->cache->stats.copies_avoided); | ||
800 | migration_success_pre_commit(mg); | ||
801 | } | ||
802 | |||
803 | static void issue_copy(struct dm_cache_migration *mg) | ||
804 | { | ||
805 | bool avoid; | ||
806 | struct cache *cache = mg->cache; | ||
807 | |||
808 | if (mg->writeback || mg->demote) | ||
809 | avoid = !is_dirty(cache, mg->cblock) || | ||
810 | is_discarded_oblock(cache, mg->old_oblock); | ||
811 | else | ||
812 | avoid = is_discarded_oblock(cache, mg->new_oblock); | ||
813 | |||
814 | avoid ? avoid_copy(mg) : issue_copy_real(mg); | ||
815 | } | ||
816 | |||
817 | static void complete_migration(struct dm_cache_migration *mg) | ||
818 | { | ||
819 | if (mg->err) | ||
820 | migration_failure(mg); | ||
821 | else | ||
822 | migration_success_pre_commit(mg); | ||
823 | } | ||
824 | |||
825 | static void process_migrations(struct cache *cache, struct list_head *head, | ||
826 | void (*fn)(struct dm_cache_migration *)) | ||
827 | { | ||
828 | unsigned long flags; | ||
829 | struct list_head list; | ||
830 | struct dm_cache_migration *mg, *tmp; | ||
831 | |||
832 | INIT_LIST_HEAD(&list); | ||
833 | spin_lock_irqsave(&cache->lock, flags); | ||
834 | list_splice_init(head, &list); | ||
835 | spin_unlock_irqrestore(&cache->lock, flags); | ||
836 | |||
837 | list_for_each_entry_safe(mg, tmp, &list, list) | ||
838 | fn(mg); | ||
839 | } | ||
840 | |||
841 | static void __queue_quiesced_migration(struct dm_cache_migration *mg) | ||
842 | { | ||
843 | list_add_tail(&mg->list, &mg->cache->quiesced_migrations); | ||
844 | } | ||
845 | |||
846 | static void queue_quiesced_migration(struct dm_cache_migration *mg) | ||
847 | { | ||
848 | unsigned long flags; | ||
849 | struct cache *cache = mg->cache; | ||
850 | |||
851 | spin_lock_irqsave(&cache->lock, flags); | ||
852 | __queue_quiesced_migration(mg); | ||
853 | spin_unlock_irqrestore(&cache->lock, flags); | ||
854 | |||
855 | wake_worker(cache); | ||
856 | } | ||
857 | |||
858 | static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) | ||
859 | { | ||
860 | unsigned long flags; | ||
861 | struct dm_cache_migration *mg, *tmp; | ||
862 | |||
863 | spin_lock_irqsave(&cache->lock, flags); | ||
864 | list_for_each_entry_safe(mg, tmp, work, list) | ||
865 | __queue_quiesced_migration(mg); | ||
866 | spin_unlock_irqrestore(&cache->lock, flags); | ||
867 | |||
868 | wake_worker(cache); | ||
869 | } | ||
870 | |||
871 | static void check_for_quiesced_migrations(struct cache *cache, | ||
872 | struct per_bio_data *pb) | ||
873 | { | ||
874 | struct list_head work; | ||
875 | |||
876 | if (!pb->all_io_entry) | ||
877 | return; | ||
878 | |||
879 | INIT_LIST_HEAD(&work); | ||
880 | if (pb->all_io_entry) | ||
881 | dm_deferred_entry_dec(pb->all_io_entry, &work); | ||
882 | |||
883 | if (!list_empty(&work)) | ||
884 | queue_quiesced_migrations(cache, &work); | ||
885 | } | ||
886 | |||
887 | static void quiesce_migration(struct dm_cache_migration *mg) | ||
888 | { | ||
889 | if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) | ||
890 | queue_quiesced_migration(mg); | ||
891 | } | ||
892 | |||
893 | static void promote(struct cache *cache, struct prealloc *structs, | ||
894 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
895 | struct dm_bio_prison_cell *cell) | ||
896 | { | ||
897 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | ||
898 | |||
899 | mg->err = false; | ||
900 | mg->writeback = false; | ||
901 | mg->demote = false; | ||
902 | mg->promote = true; | ||
903 | mg->cache = cache; | ||
904 | mg->new_oblock = oblock; | ||
905 | mg->cblock = cblock; | ||
906 | mg->old_ocell = NULL; | ||
907 | mg->new_ocell = cell; | ||
908 | mg->start_jiffies = jiffies; | ||
909 | |||
910 | inc_nr_migrations(cache); | ||
911 | quiesce_migration(mg); | ||
912 | } | ||
913 | |||
914 | static void writeback(struct cache *cache, struct prealloc *structs, | ||
915 | dm_oblock_t oblock, dm_cblock_t cblock, | ||
916 | struct dm_bio_prison_cell *cell) | ||
917 | { | ||
918 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | ||
919 | |||
920 | mg->err = false; | ||
921 | mg->writeback = true; | ||
922 | mg->demote = false; | ||
923 | mg->promote = false; | ||
924 | mg->cache = cache; | ||
925 | mg->old_oblock = oblock; | ||
926 | mg->cblock = cblock; | ||
927 | mg->old_ocell = cell; | ||
928 | mg->new_ocell = NULL; | ||
929 | mg->start_jiffies = jiffies; | ||
930 | |||
931 | inc_nr_migrations(cache); | ||
932 | quiesce_migration(mg); | ||
933 | } | ||
934 | |||
935 | static void demote_then_promote(struct cache *cache, struct prealloc *structs, | ||
936 | dm_oblock_t old_oblock, dm_oblock_t new_oblock, | ||
937 | dm_cblock_t cblock, | ||
938 | struct dm_bio_prison_cell *old_ocell, | ||
939 | struct dm_bio_prison_cell *new_ocell) | ||
940 | { | ||
941 | struct dm_cache_migration *mg = prealloc_get_migration(structs); | ||
942 | |||
943 | mg->err = false; | ||
944 | mg->writeback = false; | ||
945 | mg->demote = true; | ||
946 | mg->promote = true; | ||
947 | mg->cache = cache; | ||
948 | mg->old_oblock = old_oblock; | ||
949 | mg->new_oblock = new_oblock; | ||
950 | mg->cblock = cblock; | ||
951 | mg->old_ocell = old_ocell; | ||
952 | mg->new_ocell = new_ocell; | ||
953 | mg->start_jiffies = jiffies; | ||
954 | |||
955 | inc_nr_migrations(cache); | ||
956 | quiesce_migration(mg); | ||
957 | } | ||
958 | |||
959 | /*---------------------------------------------------------------- | ||
960 | * bio processing | ||
961 | *--------------------------------------------------------------*/ | ||
962 | static void defer_bio(struct cache *cache, struct bio *bio) | ||
963 | { | ||
964 | unsigned long flags; | ||
965 | |||
966 | spin_lock_irqsave(&cache->lock, flags); | ||
967 | bio_list_add(&cache->deferred_bios, bio); | ||
968 | spin_unlock_irqrestore(&cache->lock, flags); | ||
969 | |||
970 | wake_worker(cache); | ||
971 | } | ||
972 | |||
973 | static void process_flush_bio(struct cache *cache, struct bio *bio) | ||
974 | { | ||
975 | struct per_bio_data *pb = get_per_bio_data(bio); | ||
976 | |||
977 | BUG_ON(bio->bi_size); | ||
978 | if (!pb->req_nr) | ||
979 | remap_to_origin(cache, bio); | ||
980 | else | ||
981 | remap_to_cache(cache, bio, 0); | ||
982 | |||
983 | issue(cache, bio); | ||
984 | } | ||
985 | |||
986 | /* | ||
987 | * People generally discard large parts of a device, eg, the whole device | ||
988 | * when formatting. Splitting these large discards up into cache block | ||
989 | * sized ios and then quiescing (always necessary for discard) takes too | ||
990 | * long. | ||
991 | * | ||
992 | * We keep it simple, and allow any size of discard to come in, and just | ||
993 | * mark off blocks on the discard bitset. No passdown occurs! | ||
994 | * | ||
995 | * To implement passdown we need to change the bio_prison such that a cell | ||
996 | * can have a key that spans many blocks. | ||
997 | */ | ||
998 | static void process_discard_bio(struct cache *cache, struct bio *bio) | ||
999 | { | ||
1000 | dm_block_t start_block = dm_sector_div_up(bio->bi_sector, | ||
1001 | cache->discard_block_size); | ||
1002 | dm_block_t end_block = bio->bi_sector + bio_sectors(bio); | ||
1003 | dm_block_t b; | ||
1004 | |||
1005 | (void) sector_div(end_block, cache->discard_block_size); | ||
1006 | |||
1007 | for (b = start_block; b < end_block; b++) | ||
1008 | set_discard(cache, to_dblock(b)); | ||
1009 | |||
1010 | bio_endio(bio, 0); | ||
1011 | } | ||
1012 | |||
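/*
 * Is there room for another background migration?  Each migration in
 * flight is accounted as one cache block's worth of sectors and compared
 * against migration_threshold (also in sectors).
 */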
1013 | static bool spare_migration_bandwidth(struct cache *cache) | ||
1014 | { | ||
1015 | sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) * | ||
1016 | cache->sectors_per_block; | ||
1017 | return current_volume < cache->migration_threshold; | ||
1018 | } | ||
1019 | |||
1020 | static bool is_writethrough_io(struct cache *cache, struct bio *bio, | ||
1021 | dm_cblock_t cblock) | ||
1022 | { | ||
1023 | return bio_data_dir(bio) == WRITE && | ||
1024 | cache->features.write_through && !is_dirty(cache, cblock); | ||
1025 | } | ||
1026 | |||
1027 | static void inc_hit_counter(struct cache *cache, struct bio *bio) | ||
1028 | { | ||
1029 | atomic_inc(bio_data_dir(bio) == READ ? | ||
1030 | &cache->stats.read_hit : &cache->stats.write_hit); | ||
1031 | } | ||
1032 | |||
1033 | static void inc_miss_counter(struct cache *cache, struct bio *bio) | ||
1034 | { | ||
1035 | atomic_inc(bio_data_dir(bio) == READ ? | ||
1036 | &cache->stats.read_miss : &cache->stats.write_miss); | ||
1037 | } | ||
1038 | |||
1039 | static void process_bio(struct cache *cache, struct prealloc *structs, | ||
1040 | struct bio *bio) | ||
1041 | { | ||
1042 | int r; | ||
1043 | bool release_cell = true; | ||
1044 | dm_oblock_t block = get_bio_block(cache, bio); | ||
1045 | struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; | ||
1046 | struct policy_result lookup_result; | ||
1047 | struct per_bio_data *pb = get_per_bio_data(bio); | ||
1048 | bool discarded_block = is_discarded_oblock(cache, block); | ||
1049 | bool can_migrate = discarded_block || spare_migration_bandwidth(cache); | ||
1050 | |||
1051 | /* | ||
1052 | * Check to see if that block is currently migrating. | ||
1053 | */ | ||
1054 | cell_prealloc = prealloc_get_cell(structs); | ||
1055 | r = bio_detain(cache, block, bio, cell_prealloc, | ||
1056 | (cell_free_fn) prealloc_put_cell, | ||
1057 | structs, &new_ocell); | ||
1058 | if (r > 0) | ||
1059 | return; | ||
1060 | |||
1061 | r = policy_map(cache->policy, block, true, can_migrate, discarded_block, | ||
1062 | bio, &lookup_result); | ||
1063 | |||
1064 | if (r == -EWOULDBLOCK) | ||
1065 | /* migration has been denied */ | ||
1066 | lookup_result.op = POLICY_MISS; | ||
1067 | |||
1068 | switch (lookup_result.op) { | ||
1069 | case POLICY_HIT: | ||
1070 | inc_hit_counter(cache, bio); | ||
1071 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
1072 | |||
1073 | if (is_writethrough_io(cache, bio, lookup_result.cblock)) { | ||
1074 | /* | ||
1075 | * No need to mark anything dirty in write through mode. | ||
1076 | */ | ||
1077 | pb->req_nr == 0 ? | ||
1078 | remap_to_cache(cache, bio, lookup_result.cblock) : | ||
1079 | remap_to_origin_clear_discard(cache, bio, block); | ||
1080 | } else | ||
1081 | remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); | ||
1082 | |||
1083 | issue(cache, bio); | ||
1084 | break; | ||
1085 | |||
1086 | case POLICY_MISS: | ||
1087 | inc_miss_counter(cache, bio); | ||
1088 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
1089 | |||
1090 | if (pb->req_nr != 0) { | ||
1091 | /* | ||
1092 | * This is a duplicate writethrough io that is no | ||
1093 | * longer needed because the block has been demoted. | ||
1094 | */ | ||
1095 | bio_endio(bio, 0); | ||
1096 | } else { | ||
1097 | remap_to_origin_clear_discard(cache, bio, block); | ||
1098 | issue(cache, bio); | ||
1099 | } | ||
1100 | break; | ||
1101 | |||
1102 | case POLICY_NEW: | ||
1103 | atomic_inc(&cache->stats.promotion); | ||
1104 | promote(cache, structs, block, lookup_result.cblock, new_ocell); | ||
1105 | release_cell = false; | ||
1106 | break; | ||
1107 | |||
1108 | case POLICY_REPLACE: | ||
1109 | cell_prealloc = prealloc_get_cell(structs); | ||
1110 | r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc, | ||
1111 | (cell_free_fn) prealloc_put_cell, | ||
1112 | structs, &old_ocell); | ||
1113 | if (r > 0) { | ||
1114 | /* | ||
1115 | * We have to be careful to avoid lock inversion of | ||
1116 | * the cells. So we back off, and wait for the | ||
1117 | * old_ocell to become free. | ||
1118 | */ | ||
1119 | policy_force_mapping(cache->policy, block, | ||
1120 | lookup_result.old_oblock); | ||
1121 | atomic_inc(&cache->stats.cache_cell_clash); | ||
1122 | break; | ||
1123 | } | ||
1124 | atomic_inc(&cache->stats.demotion); | ||
1125 | atomic_inc(&cache->stats.promotion); | ||
1126 | |||
1127 | demote_then_promote(cache, structs, lookup_result.old_oblock, | ||
1128 | block, lookup_result.cblock, | ||
1129 | old_ocell, new_ocell); | ||
1130 | release_cell = false; | ||
1131 | break; | ||
1132 | |||
1133 | default: | ||
1134 | DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__, | ||
1135 | (unsigned) lookup_result.op); | ||
1136 | bio_io_error(bio); | ||
1137 | } | ||
1138 | |||
1139 | if (release_cell) | ||
1140 | cell_defer(cache, new_ocell, false); | ||
1141 | } | ||
1142 | |||
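/*
 * A commit is forced periodically: either COMMIT_PERIOD jiffies have
 * passed since the last commit, or jiffies has wrapped below the
 * recorded value.
 */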
1143 | static int need_commit_due_to_time(struct cache *cache) | ||
1144 | { | ||
1145 | return jiffies < cache->last_commit_jiffies || | ||
1146 | jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; | ||
1147 | } | ||
1148 | |||
1149 | static int commit_if_needed(struct cache *cache) | ||
1150 | { | ||
1151 | if (dm_cache_changed_this_transaction(cache->cmd) && | ||
1152 | (cache->commit_requested || need_commit_due_to_time(cache))) { | ||
1153 | atomic_inc(&cache->stats.commit_count); | ||
1154 | cache->last_commit_jiffies = jiffies; | ||
1155 | cache->commit_requested = false; | ||
1156 | return dm_cache_commit(cache->cmd, false); | ||
1157 | } | ||
1158 | |||
1159 | return 0; | ||
1160 | } | ||
1161 | |||
1162 | static void process_deferred_bios(struct cache *cache) | ||
1163 | { | ||
1164 | unsigned long flags; | ||
1165 | struct bio_list bios; | ||
1166 | struct bio *bio; | ||
1167 | struct prealloc structs; | ||
1168 | |||
1169 | memset(&structs, 0, sizeof(structs)); | ||
1170 | bio_list_init(&bios); | ||
1171 | |||
1172 | spin_lock_irqsave(&cache->lock, flags); | ||
1173 | bio_list_merge(&bios, &cache->deferred_bios); | ||
1174 | bio_list_init(&cache->deferred_bios); | ||
1175 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1176 | |||
1177 | while (!bio_list_empty(&bios)) { | ||
1178 | /* | ||
1179 | * If we've got no free migration structs, and processing | ||
1180 | * this bio might require one, we pause until there are some | ||
1181 | * prepared mappings to process. | ||
1182 | */ | ||
1183 | if (prealloc_data_structs(cache, &structs)) { | ||
1184 | spin_lock_irqsave(&cache->lock, flags); | ||
1185 | bio_list_merge(&cache->deferred_bios, &bios); | ||
1186 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1187 | break; | ||
1188 | } | ||
1189 | |||
1190 | bio = bio_list_pop(&bios); | ||
1191 | |||
1192 | if (bio->bi_rw & REQ_FLUSH) | ||
1193 | process_flush_bio(cache, bio); | ||
1194 | else if (bio->bi_rw & REQ_DISCARD) | ||
1195 | process_discard_bio(cache, bio); | ||
1196 | else | ||
1197 | process_bio(cache, &structs, bio); | ||
1198 | } | ||
1199 | |||
1200 | prealloc_free_structs(cache, &structs); | ||
1201 | } | ||
1202 | |||
1203 | static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) | ||
1204 | { | ||
1205 | unsigned long flags; | ||
1206 | struct bio_list bios; | ||
1207 | struct bio *bio; | ||
1208 | |||
1209 | bio_list_init(&bios); | ||
1210 | |||
1211 | spin_lock_irqsave(&cache->lock, flags); | ||
1212 | bio_list_merge(&bios, &cache->deferred_flush_bios); | ||
1213 | bio_list_init(&cache->deferred_flush_bios); | ||
1214 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1215 | |||
1216 | while ((bio = bio_list_pop(&bios))) | ||
1217 | submit_bios ? generic_make_request(bio) : bio_io_error(bio); | ||
1218 | } | ||
1219 | |||
1220 | static void writeback_some_dirty_blocks(struct cache *cache) | ||
1221 | { | ||
1222 | int r = 0; | ||
1223 | dm_oblock_t oblock; | ||
1224 | dm_cblock_t cblock; | ||
1225 | struct prealloc structs; | ||
1226 | struct dm_bio_prison_cell *old_ocell; | ||
1227 | |||
1228 | memset(&structs, 0, sizeof(structs)); | ||
1229 | |||
1230 | while (spare_migration_bandwidth(cache)) { | ||
1231 | if (prealloc_data_structs(cache, &structs)) | ||
1232 | break; | ||
1233 | |||
1234 | r = policy_writeback_work(cache->policy, &oblock, &cblock); | ||
1235 | if (r) | ||
1236 | break; | ||
1237 | |||
1238 | r = get_cell(cache, oblock, &structs, &old_ocell); | ||
1239 | if (r) { | ||
1240 | policy_set_dirty(cache->policy, oblock); | ||
1241 | break; | ||
1242 | } | ||
1243 | |||
1244 | writeback(cache, &structs, oblock, cblock, old_ocell); | ||
1245 | } | ||
1246 | |||
1247 | prealloc_free_structs(cache, &structs); | ||
1248 | } | ||
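One subtlety worth noting (this is a reading of the code, not spelled out in the patch): policy_writeback_work() hands back a dirty block for writeback, so when the subsequent get_cell() fails because the block is already locked by in-flight I/O, the block is handed back to the policy via policy_set_dirty(); the writeback is not lost, it will simply be retried on a later pass of the worker.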
1249 | |||
1250 | /*---------------------------------------------------------------- | ||
1251 | * Main worker loop | ||
1252 | *--------------------------------------------------------------*/ | ||
1253 | static void start_quiescing(struct cache *cache) | ||
1254 | { | ||
1255 | unsigned long flags; | ||
1256 | |||
1257 | spin_lock_irqsave(&cache->lock, flags); | ||
1258 | cache->quiescing = 1; | ||
1259 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1260 | } | ||
1261 | |||
1262 | static void stop_quiescing(struct cache *cache) | ||
1263 | { | ||
1264 | unsigned long flags; | ||
1265 | |||
1266 | spin_lock_irqsave(&cache->lock, flags); | ||
1267 | cache->quiescing = 0; | ||
1268 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1269 | } | ||
1270 | |||
1271 | static bool is_quiescing(struct cache *cache) | ||
1272 | { | ||
1273 | int r; | ||
1274 | unsigned long flags; | ||
1275 | |||
1276 | spin_lock_irqsave(&cache->lock, flags); | ||
1277 | r = cache->quiescing; | ||
1278 | spin_unlock_irqrestore(&cache->lock, flags); | ||
1279 | |||
1280 | return r; | ||
1281 | } | ||
1282 | |||
1283 | static void wait_for_migrations(struct cache *cache) | ||
1284 | { | ||
1285 | wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations)); | ||
1286 | } | ||
1287 | |||
1288 | static void stop_worker(struct cache *cache) | ||
1289 | { | ||
1290 | cancel_delayed_work(&cache->waker); | ||
1291 | flush_workqueue(cache->wq); | ||
1292 | } | ||
1293 | |||
1294 | static void requeue_deferred_io(struct cache *cache) | ||
1295 | { | ||
1296 | struct bio *bio; | ||
1297 | struct bio_list bios; | ||
1298 | |||
1299 | bio_list_init(&bios); | ||
1300 | bio_list_merge(&bios, &cache->deferred_bios); | ||
1301 | bio_list_init(&cache->deferred_bios); | ||
1302 | |||
1303 | while ((bio = bio_list_pop(&bios))) | ||
1304 | bio_endio(bio, DM_ENDIO_REQUEUE); | ||
1305 | } | ||
1306 | |||
1307 | static int more_work(struct cache *cache) | ||
1308 | { | ||
1309 | if (is_quiescing(cache)) | ||
1310 | return !list_empty(&cache->quiesced_migrations) || | ||
1311 | !list_empty(&cache->completed_migrations) || | ||
1312 | !list_empty(&cache->need_commit_migrations); | ||
1313 | else | ||
1314 | return !bio_list_empty(&cache->deferred_bios) || | ||
1315 | !bio_list_empty(&cache->deferred_flush_bios) || | ||
1316 | !list_empty(&cache->quiesced_migrations) || | ||
1317 | !list_empty(&cache->completed_migrations) || | ||
1318 | !list_empty(&cache->need_commit_migrations); | ||
1319 | } | ||
1320 | |||
1321 | static void do_worker(struct work_struct *ws) | ||
1322 | { | ||
1323 | struct cache *cache = container_of(ws, struct cache, worker); | ||
1324 | |||
1325 | do { | ||
1326 | if (!is_quiescing(cache)) | ||
1327 | process_deferred_bios(cache); | ||
1328 | |||
1329 | process_migrations(cache, &cache->quiesced_migrations, issue_copy); | ||
1330 | process_migrations(cache, &cache->completed_migrations, complete_migration); | ||
1331 | |||
1332 | writeback_some_dirty_blocks(cache); | ||
1333 | |||
1334 | if (commit_if_needed(cache)) { | ||
1335 | process_deferred_flush_bios(cache, false); | ||
1336 | |||
1337 | /* | ||
1338 | * FIXME: rollback metadata or just go into a | ||
1339 | * failure mode and error everything | ||
1340 | */ | ||
1341 | } else { | ||
1342 | process_deferred_flush_bios(cache, true); | ||
1343 | process_migrations(cache, &cache->need_commit_migrations, | ||
1344 | migration_success_post_commit); | ||
1345 | } | ||
1346 | } while (more_work(cache)); | ||
1347 | } | ||
1348 | |||
1349 | /* | ||
1350 | * We want to commit periodically so that not too much | ||
1351 | * unwritten metadata builds up. | ||
1352 | */ | ||
1353 | static void do_waker(struct work_struct *ws) | ||
1354 | { | ||
1355 | struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); | ||
1356 | wake_worker(cache); | ||
1357 | queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); | ||
1358 | } | ||
1359 | |||
1360 | /*----------------------------------------------------------------*/ | ||
1361 | |||
1362 | static int is_congested(struct dm_dev *dev, int bdi_bits) | ||
1363 | { | ||
1364 | struct request_queue *q = bdev_get_queue(dev->bdev); | ||
1365 | return bdi_congested(&q->backing_dev_info, bdi_bits); | ||
1366 | } | ||
1367 | |||
1368 | static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) | ||
1369 | { | ||
1370 | struct cache *cache = container_of(cb, struct cache, callbacks); | ||
1371 | |||
1372 | return is_congested(cache->origin_dev, bdi_bits) || | ||
1373 | is_congested(cache->cache_dev, bdi_bits); | ||
1374 | } | ||
1375 | |||
1376 | /*---------------------------------------------------------------- | ||
1377 | * Target methods | ||
1378 | *--------------------------------------------------------------*/ | ||
1379 | |||
1380 | /* | ||
1381 | * This function gets called on the error paths of the constructor, so we | ||
1382 | * have to cope with a partially initialised struct. | ||
1383 | */ | ||
1384 | static void destroy(struct cache *cache) | ||
1385 | { | ||
1386 | unsigned i; | ||
1387 | |||
1388 | if (cache->next_migration) | ||
1389 | mempool_free(cache->next_migration, cache->migration_pool); | ||
1390 | |||
1391 | if (cache->migration_pool) | ||
1392 | mempool_destroy(cache->migration_pool); | ||
1393 | |||
1394 | if (cache->all_io_ds) | ||
1395 | dm_deferred_set_destroy(cache->all_io_ds); | ||
1396 | |||
1397 | if (cache->prison) | ||
1398 | dm_bio_prison_destroy(cache->prison); | ||
1399 | |||
1400 | if (cache->wq) | ||
1401 | destroy_workqueue(cache->wq); | ||
1402 | |||
1403 | if (cache->dirty_bitset) | ||
1404 | free_bitset(cache->dirty_bitset); | ||
1405 | |||
1406 | if (cache->discard_bitset) | ||
1407 | free_bitset(cache->discard_bitset); | ||
1408 | |||
1409 | if (cache->copier) | ||
1410 | dm_kcopyd_client_destroy(cache->copier); | ||
1411 | |||
1412 | if (cache->cmd) | ||
1413 | dm_cache_metadata_close(cache->cmd); | ||
1414 | |||
1415 | if (cache->metadata_dev) | ||
1416 | dm_put_device(cache->ti, cache->metadata_dev); | ||
1417 | |||
1418 | if (cache->origin_dev) | ||
1419 | dm_put_device(cache->ti, cache->origin_dev); | ||
1420 | |||
1421 | if (cache->cache_dev) | ||
1422 | dm_put_device(cache->ti, cache->cache_dev); | ||
1423 | |||
1424 | if (cache->policy) | ||
1425 | dm_cache_policy_destroy(cache->policy); | ||
1426 | |||
1427 | for (i = 0; i < cache->nr_ctr_args ; i++) | ||
1428 | kfree(cache->ctr_args[i]); | ||
1429 | kfree(cache->ctr_args); | ||
1430 | |||
1431 | kfree(cache); | ||
1432 | } | ||
1433 | |||
1434 | static void cache_dtr(struct dm_target *ti) | ||
1435 | { | ||
1436 | struct cache *cache = ti->private; | ||
1437 | |||
1438 | destroy(cache); | ||
1439 | } | ||
1440 | |||
1441 | static sector_t get_dev_size(struct dm_dev *dev) | ||
1442 | { | ||
1443 | return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; | ||
1444 | } | ||
1445 | |||
1446 | /*----------------------------------------------------------------*/ | ||
1447 | |||
1448 | /* | ||
1449 | * Construct a cache device mapping. | ||
1450 | * | ||
1451 | * cache <metadata dev> <cache dev> <origin dev> <block size> | ||
1452 | * <#feature args> [<feature arg>]* | ||
1453 | * <policy> <#policy args> [<policy arg>]* | ||
1454 | * | ||
1455 | * metadata dev : fast device holding the persistent metadata | ||
1456 | * cache dev : fast device holding cached data blocks | ||
1457 | * origin dev : slow device holding original data blocks | ||
1458 | * block size : cache unit size in sectors | ||
1459 | * | ||
1460 | * #feature args : number of feature arguments passed | ||
1461 | * feature args : writethrough. (The default is writeback.) | ||
1462 | * | ||
1463 | * policy : the replacement policy to use | ||
1464 | * #policy args : an even number of policy arguments corresponding | ||
1465 | * to key/value pairs passed to the policy | ||
1466 | * policy args : key/value pairs passed to the policy | ||
1467 | * E.g. 'sequential_threshold 1024' | ||
1468 | * See cache-policies.txt for details. | ||
1469 | * | ||
1470 | * Optional feature arguments are: | ||
1471 | * writethrough : write through caching that prohibits cache block | ||
1472 | * content from being different from origin block content. | ||
1473 | * Without this argument, the default behaviour is to write | ||
1474 | * back cache block contents later for performance reasons, | ||
1475 | * so they may differ from the corresponding origin blocks. | ||
1476 | */ | ||
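To make the argument order concrete, a hypothetical table line (device names, sizes and the policy name are invented for illustration, not taken from this patch) might look like:

    0 41943040 cache /dev/mapper/ssd-meta /dev/mapper/ssd-blocks /dev/sdb 512 1 writethrough default 0

i.e. a 20 GiB (41943040-sector) mapped device, 512-sector (256 KiB) cache blocks, the writethrough feature enabled, and a policy called 'default' given no key/value arguments. The leading start/length pair is the standard device-mapper table prefix rather than part of the cache target's own argument list.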
1477 | struct cache_args { | ||
1478 | struct dm_target *ti; | ||
1479 | |||
1480 | struct dm_dev *metadata_dev; | ||
1481 | |||
1482 | struct dm_dev *cache_dev; | ||
1483 | sector_t cache_sectors; | ||
1484 | |||
1485 | struct dm_dev *origin_dev; | ||
1486 | sector_t origin_sectors; | ||
1487 | |||
1488 | uint32_t block_size; | ||
1489 | |||
1490 | const char *policy_name; | ||
1491 | int policy_argc; | ||
1492 | const char **policy_argv; | ||
1493 | |||
1494 | struct cache_features features; | ||
1495 | }; | ||
1496 | |||
1497 | static void destroy_cache_args(struct cache_args *ca) | ||
1498 | { | ||
1499 | if (ca->metadata_dev) | ||
1500 | dm_put_device(ca->ti, ca->metadata_dev); | ||
1501 | |||
1502 | if (ca->cache_dev) | ||
1503 | dm_put_device(ca->ti, ca->cache_dev); | ||
1504 | |||
1505 | if (ca->origin_dev) | ||
1506 | dm_put_device(ca->ti, ca->origin_dev); | ||
1507 | |||
1508 | kfree(ca); | ||
1509 | } | ||
1510 | |||
1511 | static bool at_least_one_arg(struct dm_arg_set *as, char **error) | ||
1512 | { | ||
1513 | if (!as->argc) { | ||
1514 | *error = "Insufficient args"; | ||
1515 | return false; | ||
1516 | } | ||
1517 | |||
1518 | return true; | ||
1519 | } | ||
1520 | |||
1521 | static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, | ||
1522 | char **error) | ||
1523 | { | ||
1524 | int r; | ||
1525 | sector_t metadata_dev_size; | ||
1526 | char b[BDEVNAME_SIZE]; | ||
1527 | |||
1528 | if (!at_least_one_arg(as, error)) | ||
1529 | return -EINVAL; | ||
1530 | |||
1531 | r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | ||
1532 | &ca->metadata_dev); | ||
1533 | if (r) { | ||
1534 | *error = "Error opening metadata device"; | ||
1535 | return r; | ||
1536 | } | ||
1537 | |||
1538 | metadata_dev_size = get_dev_size(ca->metadata_dev); | ||
1539 | if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) | ||
1540 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", | ||
1541 | bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING); | ||
1542 | |||
1543 | return 0; | ||
1544 | } | ||
1545 | |||
1546 | static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, | ||
1547 | char **error) | ||
1548 | { | ||
1549 | int r; | ||
1550 | |||
1551 | if (!at_least_one_arg(as, error)) | ||
1552 | return -EINVAL; | ||
1553 | |||
1554 | r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | ||
1555 | &ca->cache_dev); | ||
1556 | if (r) { | ||
1557 | *error = "Error opening cache device"; | ||
1558 | return r; | ||
1559 | } | ||
1560 | ca->cache_sectors = get_dev_size(ca->cache_dev); | ||
1561 | |||
1562 | return 0; | ||
1563 | } | ||
1564 | |||
1565 | static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, | ||
1566 | char **error) | ||
1567 | { | ||
1568 | int r; | ||
1569 | |||
1570 | if (!at_least_one_arg(as, error)) | ||
1571 | return -EINVAL; | ||
1572 | |||
1573 | r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, | ||
1574 | &ca->origin_dev); | ||
1575 | if (r) { | ||
1576 | *error = "Error opening origin device"; | ||
1577 | return r; | ||
1578 | } | ||
1579 | |||
1580 | ca->origin_sectors = get_dev_size(ca->origin_dev); | ||
1581 | if (ca->ti->len > ca->origin_sectors) { | ||
1582 | *error = "Device size larger than cached device"; | ||
1583 | return -EINVAL; | ||
1584 | } | ||
1585 | |||
1586 | return 0; | ||
1587 | } | ||
1588 | |||
1589 | static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, | ||
1590 | char **error) | ||
1591 | { | ||
1592 | unsigned long tmp; | ||
1593 | |||
1594 | if (!at_least_one_arg(as, error)) | ||
1595 | return -EINVAL; | ||
1596 | |||
1597 | if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp || | ||
1598 | tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || | ||
1599 | tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { | ||
1600 | *error = "Invalid data block size"; | ||
1601 | return -EINVAL; | ||
1602 | } | ||
1603 | |||
1604 | if (tmp > ca->cache_sectors) { | ||
1605 | *error = "Data block size is larger than the cache device"; | ||
1606 | return -EINVAL; | ||
1607 | } | ||
1608 | |||
1609 | ca->block_size = tmp; | ||
1610 | |||
1611 | return 0; | ||
1612 | } | ||
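Note that the mask test above, tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1), only rejects sizes that are not a multiple of the minimum because the minimum is a power of two; with a hypothetical minimum of 64 sectors, a requested size of 96 gives 96 & 63 = 32 and is rejected, while 1024 gives 1024 & 63 = 0 and is accepted. The block size itself does not have to be a power of two: cache_create() below falls back to sector_div() arithmetic when it is not.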
1613 | |||
1614 | static void init_features(struct cache_features *cf) | ||
1615 | { | ||
1616 | cf->mode = CM_WRITE; | ||
1617 | cf->write_through = false; | ||
1618 | } | ||
1619 | |||
1620 | static int parse_features(struct cache_args *ca, struct dm_arg_set *as, | ||
1621 | char **error) | ||
1622 | { | ||
1623 | static struct dm_arg _args[] = { | ||
1624 | {0, 1, "Invalid number of cache feature arguments"}, | ||
1625 | }; | ||
1626 | |||
1627 | int r; | ||
1628 | unsigned argc; | ||
1629 | const char *arg; | ||
1630 | struct cache_features *cf = &ca->features; | ||
1631 | |||
1632 | init_features(cf); | ||
1633 | |||
1634 | r = dm_read_arg_group(_args, as, &argc, error); | ||
1635 | if (r) | ||
1636 | return -EINVAL; | ||
1637 | |||
1638 | while (argc--) { | ||
1639 | arg = dm_shift_arg(as); | ||
1640 | |||
1641 | if (!strcasecmp(arg, "writeback")) | ||
1642 | cf->write_through = false; | ||
1643 | |||
1644 | else if (!strcasecmp(arg, "writethrough")) | ||
1645 | cf->write_through = true; | ||
1646 | |||
1647 | else { | ||
1648 | *error = "Unrecognised cache feature requested"; | ||
1649 | return -EINVAL; | ||
1650 | } | ||
1651 | } | ||
1652 | |||
1653 | return 0; | ||
1654 | } | ||
1655 | |||
1656 | static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, | ||
1657 | char **error) | ||
1658 | { | ||
1659 | static struct dm_arg _args[] = { | ||
1660 | {0, 1024, "Invalid number of policy arguments"}, | ||
1661 | }; | ||
1662 | |||
1663 | int r; | ||
1664 | |||
1665 | if (!at_least_one_arg(as, error)) | ||
1666 | return -EINVAL; | ||
1667 | |||
1668 | ca->policy_name = dm_shift_arg(as); | ||
1669 | |||
1670 | r = dm_read_arg_group(_args, as, &ca->policy_argc, error); | ||
1671 | if (r) | ||
1672 | return -EINVAL; | ||
1673 | |||
1674 | ca->policy_argv = (const char **)as->argv; | ||
1675 | dm_consume_args(as, ca->policy_argc); | ||
1676 | |||
1677 | return 0; | ||
1678 | } | ||
1679 | |||
1680 | static int parse_cache_args(struct cache_args *ca, int argc, char **argv, | ||
1681 | char **error) | ||
1682 | { | ||
1683 | int r; | ||
1684 | struct dm_arg_set as; | ||
1685 | |||
1686 | as.argc = argc; | ||
1687 | as.argv = argv; | ||
1688 | |||
1689 | r = parse_metadata_dev(ca, &as, error); | ||
1690 | if (r) | ||
1691 | return r; | ||
1692 | |||
1693 | r = parse_cache_dev(ca, &as, error); | ||
1694 | if (r) | ||
1695 | return r; | ||
1696 | |||
1697 | r = parse_origin_dev(ca, &as, error); | ||
1698 | if (r) | ||
1699 | return r; | ||
1700 | |||
1701 | r = parse_block_size(ca, &as, error); | ||
1702 | if (r) | ||
1703 | return r; | ||
1704 | |||
1705 | r = parse_features(ca, &as, error); | ||
1706 | if (r) | ||
1707 | return r; | ||
1708 | |||
1709 | r = parse_policy(ca, &as, error); | ||
1710 | if (r) | ||
1711 | return r; | ||
1712 | |||
1713 | return 0; | ||
1714 | } | ||
1715 | |||
1716 | /*----------------------------------------------------------------*/ | ||
1717 | |||
1718 | static struct kmem_cache *migration_cache; | ||
1719 | |||
1720 | static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv) | ||
1721 | { | ||
1722 | int r = 0; | ||
1723 | |||
1724 | if (argc & 1) { | ||
1725 | DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); | ||
1726 | return -EINVAL; | ||
1727 | } | ||
1728 | |||
1729 | while (argc) { | ||
1730 | r = policy_set_config_value(p, argv[0], argv[1]); | ||
1731 | if (r) { | ||
1732 | DMWARN("policy_set_config_value failed: key = '%s', value = '%s'", | ||
1733 | argv[0], argv[1]); | ||
1734 | return r; | ||
1735 | } | ||
1736 | |||
1737 | argc -= 2; | ||
1738 | argv += 2; | ||
1739 | } | ||
1740 | |||
1741 | return r; | ||
1742 | } | ||
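For example (the keys are whatever the loaded policy understands; these are invented), a policy argument vector of {"sequential_threshold", "1024", "random_threshold", "8"} results in two policy_set_config_value() calls, while an odd-length vector such as {"sequential_threshold"} is rejected up front.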
1743 | |||
1744 | static int create_cache_policy(struct cache *cache, struct cache_args *ca, | ||
1745 | char **error) | ||
1746 | { | ||
1747 | int r; | ||
1748 | |||
1749 | cache->policy = dm_cache_policy_create(ca->policy_name, | ||
1750 | cache->cache_size, | ||
1751 | cache->origin_sectors, | ||
1752 | cache->sectors_per_block); | ||
1753 | if (!cache->policy) { | ||
1754 | *error = "Error creating cache's policy"; | ||
1755 | return -ENOMEM; | ||
1756 | } | ||
1757 | |||
1758 | r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv); | ||
1759 | if (r) { | ||
1760 | dm_cache_policy_destroy(cache->policy); | ||
1760 | cache->policy = NULL; /* don't let destroy() free it a second time */ | ||
1761 | } | ||
1762 | return r; | ||
1763 | } | ||
1764 | |||
1765 | /* | ||
1766 | * We want the discard block size to be a power of two, at least as | ||
1767 | * large as the cache block size, and to give no more than 2^14 discard blocks | ||
1768 | * across the origin. | ||
1769 | */ | ||
1770 | #define MAX_DISCARD_BLOCKS (1 << 14) | ||
1771 | |||
1772 | static bool too_many_discard_blocks(sector_t discard_block_size, | ||
1773 | sector_t origin_size) | ||
1774 | { | ||
1775 | (void) sector_div(origin_size, discard_block_size); | ||
1776 | |||
1777 | return origin_size > MAX_DISCARD_BLOCKS; | ||
1778 | } | ||
1779 | |||
1780 | static sector_t calculate_discard_block_size(sector_t cache_block_size, | ||
1781 | sector_t origin_size) | ||
1782 | { | ||
1783 | sector_t discard_block_size; | ||
1784 | |||
1785 | discard_block_size = roundup_pow_of_two(cache_block_size); | ||
1786 | |||
1787 | if (origin_size) | ||
1788 | while (too_many_discard_blocks(discard_block_size, origin_size)) | ||
1789 | discard_block_size *= 2; | ||
1790 | |||
1791 | return discard_block_size; | ||
1792 | } | ||
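As a worked example (numbers invented): with 512-sector cache blocks and a 2 TiB origin (4294967296 sectors), the starting point is roundup_pow_of_two(512) = 512, which would give 8388608 discard blocks; the loop doubles the size until it reaches 262144 sectors (128 MiB), at which point the origin spans exactly 2^14 = 16384 discard blocks and the loop stops.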
1793 | |||
1794 | #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) | ||
1795 | |||
1796 | static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio); | ||
1797 | |||
1798 | static int cache_create(struct cache_args *ca, struct cache **result) | ||
1799 | { | ||
1800 | int r = 0; | ||
1801 | char **error = &ca->ti->error; | ||
1802 | struct cache *cache; | ||
1803 | struct dm_target *ti = ca->ti; | ||
1804 | dm_block_t origin_blocks; | ||
1805 | struct dm_cache_metadata *cmd; | ||
1806 | bool may_format = ca->features.mode == CM_WRITE; | ||
1807 | |||
1808 | cache = kzalloc(sizeof(*cache), GFP_KERNEL); | ||
1809 | if (!cache) | ||
1810 | return -ENOMEM; | ||
1811 | |||
1812 | cache->ti = ca->ti; | ||
1813 | ti->private = cache; | ||
1814 | ti->per_bio_data_size = sizeof(struct per_bio_data); | ||
1815 | ti->num_flush_bios = 2; | ||
1816 | ti->flush_supported = true; | ||
1817 | |||
1818 | ti->num_discard_bios = 1; | ||
1819 | ti->discards_supported = true; | ||
1820 | ti->discard_zeroes_data_unsupported = true; | ||
1821 | |||
1822 | memcpy(&cache->features, &ca->features, sizeof(cache->features)); | ||
1823 | |||
1824 | if (cache->features.write_through) | ||
1825 | ti->num_write_bios = cache_num_write_bios; | ||
1826 | |||
1827 | cache->callbacks.congested_fn = cache_is_congested; | ||
1828 | dm_table_add_target_callbacks(ti->table, &cache->callbacks); | ||
1829 | |||
1830 | cache->metadata_dev = ca->metadata_dev; | ||
1831 | cache->origin_dev = ca->origin_dev; | ||
1832 | cache->cache_dev = ca->cache_dev; | ||
1833 | |||
1834 | ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; | ||
1835 | |||
1836 | /* FIXME: factor out this whole section */ | ||
1837 | origin_blocks = cache->origin_sectors = ca->origin_sectors; | ||
1838 | (void) sector_div(origin_blocks, ca->block_size); | ||
1839 | cache->origin_blocks = to_oblock(origin_blocks); | ||
1840 | |||
1841 | cache->sectors_per_block = ca->block_size; | ||
1842 | if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { | ||
1843 | r = -EINVAL; | ||
1844 | goto bad; | ||
1845 | } | ||
1846 | |||
1847 | if (ca->block_size & (ca->block_size - 1)) { | ||
1848 | dm_block_t cache_size = ca->cache_sectors; | ||
1849 | |||
1850 | cache->sectors_per_block_shift = -1; | ||
1851 | (void) sector_div(cache_size, ca->block_size); | ||
1852 | cache->cache_size = to_cblock(cache_size); | ||
1853 | } else { | ||
1854 | cache->sectors_per_block_shift = __ffs(ca->block_size); | ||
1855 | cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); | ||
1856 | } | ||
1857 | |||
1858 | r = create_cache_policy(cache, ca, error); | ||
1859 | if (r) | ||
1860 | goto bad; | ||
1861 | cache->policy_nr_args = ca->policy_argc; | ||
1862 | |||
1863 | cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, | ||
1864 | ca->block_size, may_format, | ||
1865 | dm_cache_policy_get_hint_size(cache->policy)); | ||
1866 | if (IS_ERR(cmd)) { | ||
1867 | *error = "Error creating metadata object"; | ||
1868 | r = PTR_ERR(cmd); | ||
1869 | goto bad; | ||
1870 | } | ||
1871 | cache->cmd = cmd; | ||
1872 | |||
1873 | spin_lock_init(&cache->lock); | ||
1874 | bio_list_init(&cache->deferred_bios); | ||
1875 | bio_list_init(&cache->deferred_flush_bios); | ||
1876 | INIT_LIST_HEAD(&cache->quiesced_migrations); | ||
1877 | INIT_LIST_HEAD(&cache->completed_migrations); | ||
1878 | INIT_LIST_HEAD(&cache->need_commit_migrations); | ||
1879 | cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; | ||
1880 | atomic_set(&cache->nr_migrations, 0); | ||
1881 | init_waitqueue_head(&cache->migration_wait); | ||
1882 | |||
1883 | cache->nr_dirty = 0; | ||
1884 | cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); | ||
1885 | if (!cache->dirty_bitset) { | ||
1886 | *error = "could not allocate dirty bitset"; | ||
1887 | r = -ENOMEM; | ||
1887 | goto bad; | ||
1888 | } | ||
1889 | clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); | ||
1890 | |||
1891 | cache->discard_block_size = | ||
1892 | calculate_discard_block_size(cache->sectors_per_block, | ||
1893 | cache->origin_sectors); | ||
1894 | cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks); | ||
1895 | cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); | ||
1896 | if (!cache->discard_bitset) { | ||
1897 | *error = "could not allocate discard bitset"; | ||
1898 | r = -ENOMEM; | ||
1898 | goto bad; | ||
1899 | } | ||
1900 | clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); | ||
1901 | |||
1902 | cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); | ||
1903 | if (IS_ERR(cache->copier)) { | ||
1904 | *error = "could not create kcopyd client"; | ||
1905 | r = PTR_ERR(cache->copier); | ||
1906 | goto bad; | ||
1907 | } | ||
1908 | |||
1909 | cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); | ||
1910 | if (!cache->wq) { | ||
1911 | *error = "could not create workqueue for metadata object"; | ||
1912 | r = -ENOMEM; | ||
1912 | goto bad; | ||
1913 | } | ||
1914 | INIT_WORK(&cache->worker, do_worker); | ||
1915 | INIT_DELAYED_WORK(&cache->waker, do_waker); | ||
1916 | cache->last_commit_jiffies = jiffies; | ||
1917 | |||
1918 | cache->prison = dm_bio_prison_create(PRISON_CELLS); | ||
1919 | if (!cache->prison) { | ||
1920 | *error = "could not create bio prison"; | ||
1921 | r = -ENOMEM; | ||
1921 | goto bad; | ||
1922 | } | ||
1923 | |||
1924 | cache->all_io_ds = dm_deferred_set_create(); | ||
1925 | if (!cache->all_io_ds) { | ||
1926 | *error = "could not create all_io deferred set"; | ||
1927 | r = -ENOMEM; | ||
1927 | goto bad; | ||
1928 | } | ||
1929 | |||
1930 | cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, | ||
1931 | migration_cache); | ||
1932 | if (!cache->migration_pool) { | ||
1933 | *error = "Error creating cache's migration mempool"; | ||
1934 | r = -ENOMEM; | ||
1934 | goto bad; | ||
1935 | } | ||
1936 | |||
1937 | cache->next_migration = NULL; | ||
1938 | |||
1939 | cache->need_tick_bio = true; | ||
1940 | cache->sized = false; | ||
1941 | cache->quiescing = false; | ||
1942 | cache->commit_requested = false; | ||
1943 | cache->loaded_mappings = false; | ||
1944 | cache->loaded_discards = false; | ||
1945 | |||
1946 | load_stats(cache); | ||
1947 | |||
1948 | atomic_set(&cache->stats.demotion, 0); | ||
1949 | atomic_set(&cache->stats.promotion, 0); | ||
1950 | atomic_set(&cache->stats.copies_avoided, 0); | ||
1951 | atomic_set(&cache->stats.cache_cell_clash, 0); | ||
1952 | atomic_set(&cache->stats.commit_count, 0); | ||
1953 | atomic_set(&cache->stats.discard_count, 0); | ||
1954 | |||
1955 | *result = cache; | ||
1956 | return 0; | ||
1957 | |||
1958 | bad: | ||
1959 | destroy(cache); | ||
1960 | return r; | ||
1961 | } | ||
1962 | |||
1963 | static int copy_ctr_args(struct cache *cache, int argc, const char **argv) | ||
1964 | { | ||
1965 | unsigned i; | ||
1966 | const char **copy; | ||
1967 | |||
1968 | copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); | ||
1969 | if (!copy) | ||
1970 | return -ENOMEM; | ||
1971 | for (i = 0; i < argc; i++) { | ||
1972 | copy[i] = kstrdup(argv[i], GFP_KERNEL); | ||
1973 | if (!copy[i]) { | ||
1974 | while (i--) | ||
1975 | kfree(copy[i]); | ||
1976 | kfree(copy); | ||
1977 | return -ENOMEM; | ||
1978 | } | ||
1979 | } | ||
1980 | |||
1981 | cache->nr_ctr_args = argc; | ||
1982 | cache->ctr_args = copy; | ||
1983 | |||
1984 | return 0; | ||
1985 | } | ||
1986 | |||
1987 | static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
1988 | { | ||
1989 | int r = -EINVAL; | ||
1990 | struct cache_args *ca; | ||
1991 | struct cache *cache = NULL; | ||
1992 | |||
1993 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
1994 | if (!ca) { | ||
1995 | ti->error = "Error allocating memory for cache"; | ||
1996 | return -ENOMEM; | ||
1997 | } | ||
1998 | ca->ti = ti; | ||
1999 | |||
2000 | r = parse_cache_args(ca, argc, argv, &ti->error); | ||
2001 | if (r) | ||
2002 | goto out; | ||
2003 | |||
2004 | r = cache_create(ca, &cache); | ||
2005 | if (r) | ||
2005 | goto out; | ||
2006 | r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); | ||
2007 | if (r) { | ||
2008 | destroy(cache); | ||
2009 | goto out; | ||
2010 | } | ||
2011 | |||
2012 | ti->private = cache; | ||
2013 | |||
2014 | out: | ||
2015 | destroy_cache_args(ca); | ||
2016 | return r; | ||
2017 | } | ||
2018 | |||
2019 | static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio) | ||
2020 | { | ||
2021 | int r; | ||
2022 | struct cache *cache = ti->private; | ||
2023 | dm_oblock_t block = get_bio_block(cache, bio); | ||
2024 | dm_cblock_t cblock; | ||
2025 | |||
2026 | r = policy_lookup(cache->policy, block, &cblock); | ||
2027 | if (r < 0) | ||
2028 | return 2; /* assume the worst */ | ||
2029 | |||
2030 | return (!r && !is_dirty(cache, cblock)) ? 2 : 1; | ||
2031 | } | ||
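The reasoning behind these return values (inferred from the writethrough handling in cache_map() below, not spelled out here): in writethrough mode a write to a block that is mapped and clean, or not mapped at all, has to reach both the cache and the origin, so dm core is asked for two copies of the bio; cache_map() then routes copy 0 to one device and copy 1 to the other, completing the spare copy early when it turns out not to be needed. A dirty block is only valid in the cache, so a single bio suffices, and a failed lookup assumes the worst and asks for two.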
2032 | |||
2033 | static int cache_map(struct dm_target *ti, struct bio *bio) | ||
2034 | { | ||
2035 | struct cache *cache = ti->private; | ||
2036 | |||
2037 | int r; | ||
2038 | dm_oblock_t block = get_bio_block(cache, bio); | ||
2039 | bool can_migrate = false; | ||
2040 | bool discarded_block; | ||
2041 | struct dm_bio_prison_cell *cell; | ||
2042 | struct policy_result lookup_result; | ||
2043 | struct per_bio_data *pb; | ||
2044 | |||
2045 | if (from_oblock(block) >= from_oblock(cache->origin_blocks)) { | ||
2046 | /* | ||
2047 | * This can only occur if the io goes to a partial block at | ||
2048 | * the end of the origin device. We don't cache these. | ||
2049 | * Just remap to the origin and carry on. | ||
2050 | */ | ||
2051 | remap_to_origin_clear_discard(cache, bio, block); | ||
2052 | return DM_MAPIO_REMAPPED; | ||
2053 | } | ||
2054 | |||
2055 | pb = init_per_bio_data(bio); | ||
2056 | |||
2057 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { | ||
2058 | defer_bio(cache, bio); | ||
2059 | return DM_MAPIO_SUBMITTED; | ||
2060 | } | ||
2061 | |||
2062 | /* | ||
2063 | * Check to see if that block is currently migrating. | ||
2064 | */ | ||
2065 | cell = alloc_prison_cell(cache); | ||
2066 | if (!cell) { | ||
2067 | defer_bio(cache, bio); | ||
2068 | return DM_MAPIO_SUBMITTED; | ||
2069 | } | ||
2070 | |||
2071 | r = bio_detain(cache, block, bio, cell, | ||
2072 | (cell_free_fn) free_prison_cell, | ||
2073 | cache, &cell); | ||
2074 | if (r) { | ||
2075 | if (r < 0) | ||
2076 | defer_bio(cache, bio); | ||
2077 | |||
2078 | return DM_MAPIO_SUBMITTED; | ||
2079 | } | ||
2080 | |||
2081 | discarded_block = is_discarded_oblock(cache, block); | ||
2082 | |||
2083 | r = policy_map(cache->policy, block, false, can_migrate, discarded_block, | ||
2084 | bio, &lookup_result); | ||
2085 | if (r == -EWOULDBLOCK) { | ||
2086 | cell_defer(cache, cell, true); | ||
2087 | return DM_MAPIO_SUBMITTED; | ||
2088 | |||
2089 | } else if (r) { | ||
2090 | DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r); | ||
2091 | bio_io_error(bio); | ||
2092 | return DM_MAPIO_SUBMITTED; | ||
2093 | } | ||
2094 | |||
2095 | switch (lookup_result.op) { | ||
2096 | case POLICY_HIT: | ||
2097 | inc_hit_counter(cache, bio); | ||
2098 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
2099 | |||
2100 | if (is_writethrough_io(cache, bio, lookup_result.cblock)) { | ||
2101 | /* | ||
2102 | * No need to mark anything dirty in write through mode. | ||
2103 | */ | ||
2104 | pb->req_nr == 0 ? | ||
2105 | remap_to_cache(cache, bio, lookup_result.cblock) : | ||
2106 | remap_to_origin_clear_discard(cache, bio, block); | ||
2107 | cell_defer(cache, cell, false); | ||
2108 | } else { | ||
2109 | remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); | ||
2110 | cell_defer(cache, cell, false); | ||
2111 | } | ||
2112 | break; | ||
2113 | |||
2114 | case POLICY_MISS: | ||
2115 | inc_miss_counter(cache, bio); | ||
2116 | pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); | ||
2117 | |||
2118 | if (pb->req_nr != 0) { | ||
2119 | /* | ||
2120 | * This is a duplicate writethrough io that is no | ||
2121 | * longer needed because the block has been demoted. | ||
2122 | */ | ||
2123 | bio_endio(bio, 0); | ||
2124 | cell_defer(cache, cell, false); | ||
2125 | return DM_MAPIO_SUBMITTED; | ||
2126 | } else { | ||
2127 | remap_to_origin_clear_discard(cache, bio, block); | ||
2128 | cell_defer(cache, cell, false); | ||
2129 | } | ||
2130 | break; | ||
2131 | |||
2132 | default: | ||
2133 | DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, | ||
2134 | (unsigned) lookup_result.op); | ||
2135 | bio_io_error(bio); | ||
2136 | return DM_MAPIO_SUBMITTED; | ||
2137 | } | ||
2138 | |||
2139 | return DM_MAPIO_REMAPPED; | ||
2140 | } | ||
2141 | |||
2142 | static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) | ||
2143 | { | ||
2144 | struct cache *cache = ti->private; | ||
2145 | unsigned long flags; | ||
2146 | struct per_bio_data *pb = get_per_bio_data(bio); | ||
2147 | |||
2148 | if (pb->tick) { | ||
2149 | policy_tick(cache->policy); | ||
2150 | |||
2151 | spin_lock_irqsave(&cache->lock, flags); | ||
2152 | cache->need_tick_bio = true; | ||
2153 | spin_unlock_irqrestore(&cache->lock, flags); | ||
2154 | } | ||
2155 | |||
2156 | check_for_quiesced_migrations(cache, pb); | ||
2157 | |||
2158 | return 0; | ||
2159 | } | ||
2160 | |||
2161 | static int write_dirty_bitset(struct cache *cache) | ||
2162 | { | ||
2163 | unsigned i, r; | ||
2164 | |||
2165 | for (i = 0; i < from_cblock(cache->cache_size); i++) { | ||
2166 | r = dm_cache_set_dirty(cache->cmd, to_cblock(i), | ||
2167 | is_dirty(cache, to_cblock(i))); | ||
2168 | if (r) | ||
2169 | return r; | ||
2170 | } | ||
2171 | |||
2172 | return 0; | ||
2173 | } | ||
2174 | |||
2175 | static int write_discard_bitset(struct cache *cache) | ||
2176 | { | ||
2177 | unsigned i, r; | ||
2178 | |||
2179 | r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, | ||
2180 | cache->discard_nr_blocks); | ||
2181 | if (r) { | ||
2182 | DMERR("could not resize on-disk discard bitset"); | ||
2183 | return r; | ||
2184 | } | ||
2185 | |||
2186 | for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { | ||
2187 | r = dm_cache_set_discard(cache->cmd, to_dblock(i), | ||
2188 | is_discarded(cache, to_dblock(i))); | ||
2189 | if (r) | ||
2190 | return r; | ||
2191 | } | ||
2192 | |||
2193 | return 0; | ||
2194 | } | ||
2195 | |||
2196 | static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, | ||
2197 | uint32_t hint) | ||
2198 | { | ||
2199 | struct cache *cache = context; | ||
2200 | return dm_cache_save_hint(cache->cmd, cblock, hint); | ||
2201 | } | ||
2202 | |||
2203 | static int write_hints(struct cache *cache) | ||
2204 | { | ||
2205 | int r; | ||
2206 | |||
2207 | r = dm_cache_begin_hints(cache->cmd, cache->policy); | ||
2208 | if (r) { | ||
2209 | DMERR("dm_cache_begin_hints failed"); | ||
2210 | return r; | ||
2211 | } | ||
2212 | |||
2213 | r = policy_walk_mappings(cache->policy, save_hint, cache); | ||
2214 | if (r) | ||
2215 | DMERR("policy_walk_mappings failed"); | ||
2216 | |||
2217 | return r; | ||
2218 | } | ||
2219 | |||
2220 | /* | ||
2221 | * returns true on success | ||
2222 | */ | ||
2223 | static bool sync_metadata(struct cache *cache) | ||
2224 | { | ||
2225 | int r1, r2, r3, r4; | ||
2226 | |||
2227 | r1 = write_dirty_bitset(cache); | ||
2228 | if (r1) | ||
2229 | DMERR("could not write dirty bitset"); | ||
2230 | |||
2231 | r2 = write_discard_bitset(cache); | ||
2232 | if (r2) | ||
2233 | DMERR("could not write discard bitset"); | ||
2234 | |||
2235 | save_stats(cache); | ||
2236 | |||
2237 | r3 = write_hints(cache); | ||
2238 | if (r3) | ||
2239 | DMERR("could not write hints"); | ||
2240 | |||
2241 | /* | ||
2242 | * If writing the above metadata failed, we still commit, but don't | ||
2243 | * set the clean shutdown flag. This will effectively force every | ||
2244 | * dirty bit to be set on reload. | ||
2245 | */ | ||
2246 | r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3); | ||
2247 | if (r4) | ||
2248 | DMERR("could not write cache metadata. Data loss may occur."); | ||
2249 | |||
2250 | return !r1 && !r2 && !r3 && !r4; | ||
2251 | } | ||
2252 | |||
2253 | static void cache_postsuspend(struct dm_target *ti) | ||
2254 | { | ||
2255 | struct cache *cache = ti->private; | ||
2256 | |||
2257 | start_quiescing(cache); | ||
2258 | wait_for_migrations(cache); | ||
2259 | stop_worker(cache); | ||
2260 | requeue_deferred_io(cache); | ||
2261 | stop_quiescing(cache); | ||
2262 | |||
2263 | (void) sync_metadata(cache); | ||
2264 | } | ||
2265 | |||
2266 | static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, | ||
2267 | bool dirty, uint32_t hint, bool hint_valid) | ||
2268 | { | ||
2269 | int r; | ||
2270 | struct cache *cache = context; | ||
2271 | |||
2272 | r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); | ||
2273 | if (r) | ||
2274 | return r; | ||
2275 | |||
2276 | if (dirty) | ||
2277 | set_dirty(cache, oblock, cblock); | ||
2278 | else | ||
2279 | clear_dirty(cache, oblock, cblock); | ||
2280 | |||
2281 | return 0; | ||
2282 | } | ||
2283 | |||
2284 | static int load_discard(void *context, sector_t discard_block_size, | ||
2285 | dm_dblock_t dblock, bool discard) | ||
2286 | { | ||
2287 | struct cache *cache = context; | ||
2288 | |||
2289 | /* FIXME: handle mis-matched block size */ | ||
2290 | |||
2291 | if (discard) | ||
2292 | set_discard(cache, dblock); | ||
2293 | else | ||
2294 | clear_discard(cache, dblock); | ||
2295 | |||
2296 | return 0; | ||
2297 | } | ||
2298 | |||
2299 | static int cache_preresume(struct dm_target *ti) | ||
2300 | { | ||
2301 | int r = 0; | ||
2302 | struct cache *cache = ti->private; | ||
2303 | sector_t actual_cache_size = get_dev_size(cache->cache_dev); | ||
2304 | (void) sector_div(actual_cache_size, cache->sectors_per_block); | ||
2305 | |||
2306 | /* | ||
2307 | * Check to see if the cache has resized. | ||
2308 | */ | ||
2309 | if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) { | ||
2310 | cache->cache_size = to_cblock(actual_cache_size); | ||
2311 | |||
2312 | r = dm_cache_resize(cache->cmd, cache->cache_size); | ||
2313 | if (r) { | ||
2314 | DMERR("could not resize cache metadata"); | ||
2315 | return r; | ||
2316 | } | ||
2317 | |||
2318 | cache->sized = true; | ||
2319 | } | ||
2320 | |||
2321 | if (!cache->loaded_mappings) { | ||
2322 | r = dm_cache_load_mappings(cache->cmd, | ||
2323 | dm_cache_policy_get_name(cache->policy), | ||
2324 | load_mapping, cache); | ||
2325 | if (r) { | ||
2326 | DMERR("could not load cache mappings"); | ||
2327 | return r; | ||
2328 | } | ||
2329 | |||
2330 | cache->loaded_mappings = true; | ||
2331 | } | ||
2332 | |||
2333 | if (!cache->loaded_discards) { | ||
2334 | r = dm_cache_load_discards(cache->cmd, load_discard, cache); | ||
2335 | if (r) { | ||
2336 | DMERR("could not load origin discards"); | ||
2337 | return r; | ||
2338 | } | ||
2339 | |||
2340 | cache->loaded_discards = true; | ||
2341 | } | ||
2342 | |||
2343 | return r; | ||
2344 | } | ||
2345 | |||
2346 | static void cache_resume(struct dm_target *ti) | ||
2347 | { | ||
2348 | struct cache *cache = ti->private; | ||
2349 | |||
2350 | cache->need_tick_bio = true; | ||
2351 | do_waker(&cache->waker.work); | ||
2352 | } | ||
2353 | |||
2354 | /* | ||
2355 | * Status format: | ||
2356 | * | ||
2357 | * <#used metadata blocks>/<#total metadata blocks> | ||
2358 | * <#read hits> <#read misses> <#write hits> <#write misses> | ||
2359 | * <#demotions> <#promotions> <#blocks in cache> <#dirty> | ||
2360 | * <#features> <features>* | ||
2361 | * <#core args> <core args> | ||
2362 | * <#policy args> <policy args>* | ||
2363 | */ | ||
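Purely to illustrate the format (every number below is invented), the STATUSTYPE_INFO output assembled by the function that follows might read:

    91/4096 324 21 144 18 5 18 13 2 1 writethrough 2 migration_threshold 204800 ...

i.e. used/total metadata blocks, the four read/write hit/miss counters, demotions, promotions, blocks resident in the cache, dirty blocks, then the feature list, the core key/value pair and finally whatever key/value pairs the policy chooses to emit.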
2364 | static void cache_status(struct dm_target *ti, status_type_t type, | ||
2365 | unsigned status_flags, char *result, unsigned maxlen) | ||
2366 | { | ||
2367 | int r = 0; | ||
2368 | unsigned i; | ||
2369 | ssize_t sz = 0; | ||
2370 | dm_block_t nr_free_blocks_metadata = 0; | ||
2371 | dm_block_t nr_blocks_metadata = 0; | ||
2372 | char buf[BDEVNAME_SIZE]; | ||
2373 | struct cache *cache = ti->private; | ||
2374 | dm_cblock_t residency; | ||
2375 | |||
2376 | switch (type) { | ||
2377 | case STATUSTYPE_INFO: | ||
2378 | /* Commit to ensure statistics aren't out-of-date */ | ||
2379 | if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) { | ||
2380 | r = dm_cache_commit(cache->cmd, false); | ||
2381 | if (r) | ||
2382 | DMERR("could not commit metadata for accurate status"); | ||
2383 | } | ||
2384 | |||
2385 | r = dm_cache_get_free_metadata_block_count(cache->cmd, | ||
2386 | &nr_free_blocks_metadata); | ||
2387 | if (r) { | ||
2388 | DMERR("could not get metadata free block count"); | ||
2389 | goto err; | ||
2390 | } | ||
2391 | |||
2392 | r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); | ||
2393 | if (r) { | ||
2394 | DMERR("could not get metadata device size"); | ||
2395 | goto err; | ||
2396 | } | ||
2397 | |||
2398 | residency = policy_residency(cache->policy); | ||
2399 | |||
2400 | DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ", | ||
2401 | (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), | ||
2402 | (unsigned long long)nr_blocks_metadata, | ||
2403 | (unsigned) atomic_read(&cache->stats.read_hit), | ||
2404 | (unsigned) atomic_read(&cache->stats.read_miss), | ||
2405 | (unsigned) atomic_read(&cache->stats.write_hit), | ||
2406 | (unsigned) atomic_read(&cache->stats.write_miss), | ||
2407 | (unsigned) atomic_read(&cache->stats.demotion), | ||
2408 | (unsigned) atomic_read(&cache->stats.promotion), | ||
2409 | (unsigned long long) from_cblock(residency), | ||
2410 | cache->nr_dirty); | ||
2411 | |||
2412 | if (cache->features.write_through) | ||
2413 | DMEMIT("1 writethrough "); | ||
2414 | else | ||
2415 | DMEMIT("0 "); | ||
2416 | |||
2417 | DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); | ||
2418 | if (sz < maxlen) { | ||
2419 | r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); | ||
2420 | if (r) | ||
2421 | DMERR("policy_emit_config_values returned %d", r); | ||
2422 | } | ||
2423 | |||
2424 | break; | ||
2425 | |||
2426 | case STATUSTYPE_TABLE: | ||
2427 | format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); | ||
2428 | DMEMIT("%s ", buf); | ||
2429 | format_dev_t(buf, cache->cache_dev->bdev->bd_dev); | ||
2430 | DMEMIT("%s ", buf); | ||
2431 | format_dev_t(buf, cache->origin_dev->bdev->bd_dev); | ||
2432 | DMEMIT("%s", buf); | ||
2433 | |||
2434 | for (i = 0; i < cache->nr_ctr_args - 1; i++) | ||
2435 | DMEMIT(" %s", cache->ctr_args[i]); | ||
2436 | if (cache->nr_ctr_args) | ||
2437 | DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); | ||
2438 | } | ||
2439 | |||
2440 | return; | ||
2441 | |||
2442 | err: | ||
2443 | DMEMIT("Error"); | ||
2444 | } | ||
2445 | |||
2446 | #define NOT_CORE_OPTION 1 | ||
2447 | |||
2448 | static int process_config_option(struct cache *cache, char **argv) | ||
2449 | { | ||
2450 | unsigned long tmp; | ||
2451 | |||
2452 | if (!strcasecmp(argv[0], "migration_threshold")) { | ||
2453 | if (kstrtoul(argv[1], 10, &tmp)) | ||
2454 | return -EINVAL; | ||
2455 | |||
2456 | cache->migration_threshold = tmp; | ||
2457 | return 0; | ||
2458 | } | ||
2459 | |||
2460 | return NOT_CORE_OPTION; | ||
2461 | } | ||
2462 | |||
2463 | /* | ||
2464 | * Supports <key> <value>. | ||
2465 | * | ||
2466 | * The key migration_threshold is supported by the cache target core. | ||
2467 | */ | ||
2468 | static int cache_message(struct dm_target *ti, unsigned argc, char **argv) | ||
2469 | { | ||
2470 | int r; | ||
2471 | struct cache *cache = ti->private; | ||
2472 | |||
2473 | if (argc != 2) | ||
2474 | return -EINVAL; | ||
2475 | |||
2476 | r = process_config_option(cache, argv); | ||
2477 | if (r == NOT_CORE_OPTION) | ||
2478 | return policy_set_config_value(cache->policy, argv[0], argv[1]); | ||
2479 | |||
2480 | return r; | ||
2481 | } | ||
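In practice the only key handled by the core is migration_threshold; anything else is forwarded to the policy. A hypothetical invocation (device name invented) that sets the threshold back to its default of 204800 (DEFAULT_MIGRATION_THRESHOLD above) would be:

    dmsetup message my-cache 0 migration_threshold 204800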
2482 | |||
2483 | static int cache_iterate_devices(struct dm_target *ti, | ||
2484 | iterate_devices_callout_fn fn, void *data) | ||
2485 | { | ||
2486 | int r = 0; | ||
2487 | struct cache *cache = ti->private; | ||
2488 | |||
2489 | r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); | ||
2490 | if (!r) | ||
2491 | r = fn(ti, cache->origin_dev, 0, ti->len, data); | ||
2492 | |||
2493 | return r; | ||
2494 | } | ||
2495 | |||
2496 | /* | ||
2497 | * We assume I/O is going to the origin (which is the volume | ||
2498 | * more likely to have restrictions e.g. by being striped). | ||
2499 | * (Looking up the exact location of the data would be expensive | ||
2500 | * and could always be out of date by the time the bio is submitted.) | ||
2501 | */ | ||
2502 | static int cache_bvec_merge(struct dm_target *ti, | ||
2503 | struct bvec_merge_data *bvm, | ||
2504 | struct bio_vec *biovec, int max_size) | ||
2505 | { | ||
2506 | struct cache *cache = ti->private; | ||
2507 | struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev); | ||
2508 | |||
2509 | if (!q->merge_bvec_fn) | ||
2510 | return max_size; | ||
2511 | |||
2512 | bvm->bi_bdev = cache->origin_dev->bdev; | ||
2513 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
2514 | } | ||
2515 | |||
2516 | static void set_discard_limits(struct cache *cache, struct queue_limits *limits) | ||
2517 | { | ||
2518 | /* | ||
2519 | * FIXME: these limits may be incompatible with the cache device | ||
2520 | */ | ||
2521 | limits->max_discard_sectors = cache->discard_block_size * 1024; | ||
2522 | limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; | ||
2523 | } | ||
2524 | |||
2525 | static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
2526 | { | ||
2527 | struct cache *cache = ti->private; | ||
2528 | |||
2529 | blk_limits_io_min(limits, 0); | ||
2530 | blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); | ||
2531 | set_discard_limits(cache, limits); | ||
2532 | } | ||
2533 | |||
2534 | /*----------------------------------------------------------------*/ | ||
2535 | |||
2536 | static struct target_type cache_target = { | ||
2537 | .name = "cache", | ||
2538 | .version = {1, 0, 0}, | ||
2539 | .module = THIS_MODULE, | ||
2540 | .ctr = cache_ctr, | ||
2541 | .dtr = cache_dtr, | ||
2542 | .map = cache_map, | ||
2543 | .end_io = cache_end_io, | ||
2544 | .postsuspend = cache_postsuspend, | ||
2545 | .preresume = cache_preresume, | ||
2546 | .resume = cache_resume, | ||
2547 | .status = cache_status, | ||
2548 | .message = cache_message, | ||
2549 | .iterate_devices = cache_iterate_devices, | ||
2550 | .merge = cache_bvec_merge, | ||
2551 | .io_hints = cache_io_hints, | ||
2552 | }; | ||
2553 | |||
2554 | static int __init dm_cache_init(void) | ||
2555 | { | ||
2556 | int r; | ||
2557 | |||
2558 | r = dm_register_target(&cache_target); | ||
2559 | if (r) { | ||
2560 | DMERR("cache target registration failed: %d", r); | ||
2561 | return r; | ||
2562 | } | ||
2563 | |||
2564 | migration_cache = KMEM_CACHE(dm_cache_migration, 0); | ||
2565 | if (!migration_cache) { | ||
2566 | dm_unregister_target(&cache_target); | ||
2567 | return -ENOMEM; | ||
2568 | } | ||
2569 | |||
2570 | return 0; | ||
2571 | } | ||
2572 | |||
2573 | static void __exit dm_cache_exit(void) | ||
2574 | { | ||
2575 | dm_unregister_target(&cache_target); | ||
2576 | kmem_cache_destroy(migration_cache); | ||
2577 | } | ||
2578 | |||
2579 | module_init(dm_cache_init); | ||
2580 | module_exit(dm_cache_exit); | ||
2581 | |||
2582 | MODULE_DESCRIPTION(DM_NAME " cache target"); | ||
2583 | MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); | ||
2584 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 28c3ed072a79..81b513890e2b 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c | |||
@@ -613,6 +613,7 @@ int dm_bm_flush_and_unlock(struct dm_block_manager *bm, | |||
613 | 613 | ||
614 | return dm_bufio_write_dirty_buffers(bm->bufio); | 614 | return dm_bufio_write_dirty_buffers(bm->bufio); |
615 | } | 615 | } |
616 | EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock); | ||
616 | 617 | ||
617 | void dm_bm_set_read_only(struct dm_block_manager *bm) | 618 | void dm_bm_set_read_only(struct dm_block_manager *bm) |
618 | { | 619 | { |