13 files changed, 483 insertions, 149 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 9a06fe883766..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -254,16 +254,6 @@ config DM_THIN_PROVISIONING
       ---help---
         Provides thin provisioning and snapshots that share a data store.
-config DM_DEBUG_BLOCK_STACK_TRACING
-        boolean "Keep stack trace of persistent data block lock holders"
-        depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
-        select STACKTRACE
-        ---help---
-          Enable this for messages that may help debug problems with the
-          block manager locking used by thin provisioning and caching.
-          If unsure, say N.
 config DM_CACHE
       tristate "Cache target (EXPERIMENTAL)"
       depends on BLK_DEV_DM
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 1e018e986610..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -872,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p)
 {
        struct mq_policy *mq = to_mq_policy(p);
-        kfree(mq->table);
+        vfree(mq->table);
        epool_exit(&mq->cache_pool);
        epool_exit(&mq->pre_cache_pool);
        kfree(mq);
@@ -1245,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
        mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
        mq->hash_bits = ffs(mq->nr_buckets) - 1;
-        mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL);
+        mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
        if (!mq->table)
                goto bad_alloc_table;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index ffd472e015ca..074b9c8e4cf0 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -289,6 +289,7 @@ struct per_bio_data {
        bool tick:1;
        unsigned req_nr:2;
        struct dm_deferred_entry *all_io_entry;
+        struct dm_hook_info hook_info;
        /*
         * writethrough fields.  These MUST remain at the end of this
@@ -297,7 +298,6 @@ struct per_bio_data {
         */
        struct cache *cache;
        dm_cblock_t cblock;
-        struct dm_hook_info hook_info;
        struct dm_bio_details bio_details;
 };
@@ -671,15 +671,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
                           dm_cblock_t cblock)
 {
        sector_t bi_sector = bio->bi_iter.bi_sector;
+        sector_t block = from_cblock(cblock);
        bio->bi_bdev = cache->cache_dev->bdev;
        if (!block_size_is_power_of_two(cache))
                bio->bi_iter.bi_sector =
-                        (from_cblock(cblock) * cache->sectors_per_block) +
+                        (block * cache->sectors_per_block) +
                        sector_div(bi_sector, cache->sectors_per_block);
        else
                bio->bi_iter.bi_sector =
-                        (from_cblock(cblock) << cache->sectors_per_block_shift) |
+                        (block << cache->sectors_per_block_shift) |
                        (bi_sector & (cache->sectors_per_block - 1));
 }
@@ -978,12 +979,13 @@ static void issue_copy_real(struct dm_cache_migration *mg)
        int r;
        struct dm_io_region o_region, c_region;
        struct cache *cache = mg->cache;
+        sector_t cblock = from_cblock(mg->cblock);
        o_region.bdev = cache->origin_dev->bdev;
        o_region.count = cache->sectors_per_block;
        c_region.bdev = cache->cache_dev->bdev;
-        c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
+        c_region.sector = cblock * cache->sectors_per_block;
        c_region.count = cache->sectors_per_block;
        if (mg->writeback || mg->demote) {
@@ -1010,13 +1012,15 @@ static void overwrite_endio(struct bio *bio, int err)
        struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
        unsigned long flags;
+        dm_unhook_bio(&pb->hook_info, bio);
        if (err)
                mg->err = true;
+        mg->requeue_holder = false;
        spin_lock_irqsave(&cache->lock, flags);
        list_add_tail(&mg->list, &cache->completed_migrations);
-        dm_unhook_bio(&pb->hook_info, bio);
-        mg->requeue_holder = false;
        spin_unlock_irqrestore(&cache->lock, flags);
        wake_worker(cache);
@@ -2461,20 +2465,18 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
        bool discarded_block;
        struct dm_bio_prison_cell *cell;
        struct policy_result lookup_result;
-        struct per_bio_data *pb;
+        struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
-        if (from_oblock(block) > from_oblock(cache->origin_blocks)) {
+        if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
                /*
                 * This can only occur if the io goes to a partial block at
                 * the end of the origin device.  We don't cache these.
                 * Just remap to the origin and carry on.
                 */
-                remap_to_origin_clear_discard(cache, bio, block);
+                remap_to_origin(cache, bio);
                return DM_MAPIO_REMAPPED;
        }
-        pb = init_per_bio_data(bio, pb_data_size);
        if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
                defer_bio(cache, bio);
                return DM_MAPIO_SUBMITTED;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b2b8a10e8427..3842ac738f98 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -201,29 +201,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse
 /*
 * Functions for getting the pages from a bvec.
 */
-static void bio_get_page(struct dpages *dp,
+static void bio_get_page(struct dpages *dp, struct page **p,
-                  struct page **p, unsigned long *len, unsigned *offset)
+                         unsigned long *len, unsigned *offset)
 {
-        struct bio *bio = dp->context_ptr;
+        struct bio_vec *bvec = dp->context_ptr;
-        struct bio_vec bvec = bio_iovec(bio);
+        *p = bvec->bv_page;
-        *p = bvec.bv_page;
+        *len = bvec->bv_len - dp->context_u;
-        *len = bvec.bv_len;
+        *offset = bvec->bv_offset + dp->context_u;
-        *offset = bvec.bv_offset;
 }
 static void bio_next_page(struct dpages *dp)
 {
-        struct bio *bio = dp->context_ptr;
+        struct bio_vec *bvec = dp->context_ptr;
-        struct bio_vec bvec = bio_iovec(bio);
+        dp->context_ptr = bvec + 1;
+        dp->context_u = 0;
-        bio_advance(bio, bvec.bv_len);
 }
 static void bio_dp_init(struct dpages *dp, struct bio *bio)
 {
        dp->get_page = bio_get_page;
        dp->next_page = bio_next_page;
-        dp->context_ptr = bio;
+        dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+        dp->context_u = bio->bi_iter.bi_bvec_done;
 }
 /*
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6eb9dc9ef8f3..422a9fdeb53e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1626,8 +1626,11 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
        /*
         * Only pass ioctls through if the device sizes match exactly.
         */
-        if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
+        if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
-                r = scsi_verify_blk_ioctl(NULL, cmd);
+                int err = scsi_verify_blk_ioctl(NULL, cmd);
+                if (err)
+                        r = err;
+        }
        if (r == -ENOTCONN && !fatal_signal_pending(current))
                queue_work(kmultipathd, &m->process_queued_ios);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index f284e0bfb25f..7dfdb5c746d6 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1244,6 +1244,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
                        dm_bio_restore(bd, bio);
                        bio_record->details.bi_bdev = NULL;
+                        atomic_inc(&bio->bi_remaining);
                        queue_bio(ms, bio, rw);
                        return DM_ENDIO_INCOMPLETE;
                }
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index afc3d017de4c..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -546,6 +546,9 @@ static int read_exceptions(struct pstore *ps,
                r = insert_exceptions(ps, area, callback, callback_context,
                                      &full);
+                if (!full)
+                        memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
                dm_bufio_release(bp);
                dm_bufio_forget(client, chunk);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 7da347665552..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
 #define THIN_SUPERBLOCK_MAGIC 27022010
 #define THIN_SUPERBLOCK_LOCATION 0
-#define THIN_VERSION 1
+#define THIN_VERSION 2
 #define THIN_METADATA_CACHE_SIZE 64
 #define SECTOR_TO_BLOCK_SHIFT 3
@@ -483,7 +483,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
        disk_super->data_mapping_root = cpu_to_le64(pmd->root);
        disk_super->device_details_root = cpu_to_le64(pmd->details_root);
-        disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+        disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
        disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
        disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
@@ -651,7 +651,7 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
 {
        int r;
-        pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE,
+        pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
                                          THIN_METADATA_CACHE_SIZE,
                                          THIN_MAX_CONCURRENT_LOCKS);
        if (IS_ERR(pmd->bm)) {
@@ -1489,6 +1489,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
        return r;
 }
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
+{
+        bool r = false;
+        struct dm_thin_device *td, *tmp;
+        down_read(&pmd->root_lock);
+        list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
+                if (td->changed) {
+                        r = td->changed;
+                        break;
+                }
+        }
+        up_read(&pmd->root_lock);
+        return r;
+}
 bool dm_thin_aborted_changes(struct dm_thin_device *td)
 {
        bool r;
@@ -1738,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
        return r;
 }
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
+{
+        int r;
+        struct dm_block *sblock;
+        struct thin_disk_superblock *disk_super;
+        down_write(&pmd->root_lock);
+        pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
+        r = superblock_lock(pmd, &sblock);
+        if (r) {
+                DMERR("couldn't read superblock");
+                goto out;
+        }
+        disk_super = dm_block_data(sblock);
+        disk_super->flags = cpu_to_le32(pmd->flags);
+        dm_bm_unlock(sblock);
+out:
+        up_write(&pmd->root_lock);
+        return r;
+}
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
+{
+        bool needs_check;
+        down_read(&pmd->root_lock);
+        needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
+        up_read(&pmd->root_lock);
+        return needs_check;
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 9a368567632f..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -9,16 +9,14 @@
 #include "persistent-data/dm-block-manager.h"
 #include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-space-map-metadata.h"
-#define THIN_METADATA_BLOCK_SIZE 4096
+#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
 /*
 * The metadata device is currently limited in size.
- *
- * We have one block of index, which can hold 255 index entries.  Each
- * index entry contains allocation info about 16k metadata blocks.
 */
-#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
 /*
 * A metadata device larger than 16GB triggers a warning.
@@ -27,6 +25,11 @@
 /*----------------------------------------------------------------*/
+/*
+ * Thin metadata superblock flags.
+ */
+#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
 struct dm_pool_metadata;
 struct dm_thin_device;
@@ -161,6 +164,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
 */
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd);
 bool dm_thin_aborted_changes(struct dm_thin_device *td);
 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
@@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
                                        dm_sm_threshold_fn fn,
                                        void *context);
+/*
+ * Updates the superblock immediately.
+ */
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
 /*----------------------------------------------------------------*/
 #endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index faaf944597ab..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 struct dm_thin_new_mapping;
 /*
- * The pool runs in 3 modes.  Ordered in degraded order for comparisons.
+ * The pool runs in 4 modes, ordered from least to most degraded for comparisons.
 */
 enum pool_mode {
        PM_WRITE,               /* metadata may be changed */
+        PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
        PM_READ_ONLY,           /* metadata may not be changed */
        PM_FAIL,                /* all I/O fails */
 };
@@ -198,7 +199,6 @@ struct pool {
 };
 static enum pool_mode get_pool_mode(struct pool *pool);
-static void out_of_data_space(struct pool *pool);
 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 /*
@@ -226,6 +226,7 @@ struct thin_c {
        struct pool *pool;
        struct dm_thin_device *td;
+        bool requeue_mode:1;
 };
 /*----------------------------------------------------------------*/
@@ -369,14 +370,18 @@ struct dm_thin_endio_hook {
        struct dm_thin_new_mapping *overwrite_mapping;
 };
-static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 {
        struct bio *bio;
        struct bio_list bios;
+        unsigned long flags;
        bio_list_init(&bios);
+        spin_lock_irqsave(&tc->pool->lock, flags);
        bio_list_merge(&bios, master);
        bio_list_init(master);
+        spin_unlock_irqrestore(&tc->pool->lock, flags);
        while ((bio = bio_list_pop(&bios))) {
                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -391,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 static void requeue_io(struct thin_c *tc)
 {
        struct pool *pool = tc->pool;
+        requeue_bio_list(tc, &pool->deferred_bios);
+        requeue_bio_list(tc, &pool->retry_on_resume_list);
+}
+static void error_retry_list(struct pool *pool)
+{
+        struct bio *bio;
        unsigned long flags;
+        struct bio_list bios;
+        bio_list_init(&bios);
        spin_lock_irqsave(&pool->lock, flags);
-        __requeue_bio_list(tc, &pool->deferred_bios);
+        bio_list_merge(&bios, &pool->retry_on_resume_list);
-        __requeue_bio_list(tc, &pool->retry_on_resume_list);
+        bio_list_init(&pool->retry_on_resume_list);
        spin_unlock_irqrestore(&pool->lock, flags);
+        while ((bio = bio_list_pop(&bios)))
+                bio_io_error(bio);
 }
 /*
@@ -925,13 +944,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
        }
 }
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
        int r;
        dm_block_t free_blocks;
        struct pool *pool = tc->pool;
-        if (get_pool_mode(pool) != PM_WRITE)
+        if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
                return -EINVAL;
        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
@@ -958,7 +979,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
                }
                if (!free_blocks) {
-                        out_of_data_space(pool);
+                        set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
                        return -ENOSPC;
                }
        }
@@ -988,15 +1009,32 @@ static void retry_on_resume(struct bio *bio)
        spin_unlock_irqrestore(&pool->lock, flags);
 }
-static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+static bool should_error_unserviceable_bio(struct pool *pool)
 {
-        /*
+        enum pool_mode m = get_pool_mode(pool);
-         * When pool is read-only, no cell locking is needed because
-         * nothing is changing.
-         */
-        WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
-        if (pool->pf.error_if_no_space)
+        switch (m) {
+        case PM_WRITE:
+                /* Shouldn't get here */
+                DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
+                return true;
+        case PM_OUT_OF_DATA_SPACE:
+                return pool->pf.error_if_no_space;
+        case PM_READ_ONLY:
+        case PM_FAIL:
+                return true;
+        default:
+                /* Shouldn't get here */
+                DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
+                return true;
+        }
+}
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+        if (should_error_unserviceable_bio(pool))
                bio_io_error(bio);
        else
                retry_on_resume(bio);
@@ -1007,11 +1045,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
        struct bio *bio;
        struct bio_list bios;
+        if (should_error_unserviceable_bio(pool)) {
+                cell_error(pool, cell);
+                return;
+        }
        bio_list_init(&bios);
        cell_release(pool, cell, &bios);
-        while ((bio = bio_list_pop(&bios)))
+        if (should_error_unserviceable_bio(pool))
-                handle_unserviceable_bio(pool, bio);
+                while ((bio = bio_list_pop(&bios)))
+                        bio_io_error(bio);
+        else
+                while ((bio = bio_list_pop(&bios)))
+                        retry_on_resume(bio);
 }
 static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1296,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
        }
 }
+static void process_bio_success(struct thin_c *tc, struct bio *bio)
+{
+        bio_endio(bio, 0);
+}
 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
 {
        bio_io_error(bio);
@@ -1328,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool)
                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
                struct thin_c *tc = h->tc;
+                if (tc->requeue_mode) {
+                        bio_endio(bio, DM_ENDIO_REQUEUE);
+                        continue;
+                }
                /*
                 * If we've got no free new_mapping structs, and processing
                 * this bio might require one, we pause until there are some
@@ -1357,7 +1414,8 @@ static void process_deferred_bios(struct pool *pool)
        bio_list_init(&pool->deferred_flush_bios);
        spin_unlock_irqrestore(&pool->lock, flags);
-        if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
+        if (bio_list_empty(&bios) &&
+            !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
                return;
        if (commit(pool)) {
@@ -1393,51 +1451,134 @@ static void do_waker(struct work_struct *ws)
 /*----------------------------------------------------------------*/
+struct noflush_work {
+        struct work_struct worker;
+        struct thin_c *tc;
+        atomic_t complete;
+        wait_queue_head_t wait;
+};
+static void complete_noflush_work(struct noflush_work *w)
+{
+        atomic_set(&w->complete, 1);
+        wake_up(&w->wait);
+}
+static void do_noflush_start(struct work_struct *ws)
+{
+        struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+        w->tc->requeue_mode = true;
+        requeue_io(w->tc);
+        complete_noflush_work(w);
+}
+static void do_noflush_stop(struct work_struct *ws)
+{
+        struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+        w->tc->requeue_mode = false;
+        complete_noflush_work(w);
+}
+static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
+{
+        struct noflush_work w;
+        INIT_WORK(&w.worker, fn);
+        w.tc = tc;
+        atomic_set(&w.complete, 0);
+        init_waitqueue_head(&w.wait);
+        queue_work(tc->pool->wq, &w.worker);
+        wait_event(w.wait, atomic_read(&w.complete));
+}
+/*----------------------------------------------------------------*/
 static enum pool_mode get_pool_mode(struct pool *pool)
 {
        return pool->pf.mode;
 }
+static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
+{
+        dm_table_event(pool->ti->table);
+        DMINFO("%s: switching pool to %s mode",
+               dm_device_name(pool->pool_md), new_mode);
+}
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
-        int r;
+        struct pool_c *pt = pool->ti->private;
-        enum pool_mode old_mode = pool->pf.mode;
+        bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+        enum pool_mode old_mode = get_pool_mode(pool);
+        /*
+         * Never allow the pool to transition to PM_WRITE mode if user
+         * intervention is required to verify metadata and data consistency.
+         */
+        if (new_mode == PM_WRITE && needs_check) {
+                DMERR("%s: unable to switch pool to write mode until repaired.",
+                      dm_device_name(pool->pool_md));
+                if (old_mode != new_mode)
+                        new_mode = old_mode;
+                else
+                        new_mode = PM_READ_ONLY;
+        }
+        /*
+         * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+         * not going to recover without a thin_repair.  So we never let the
+         * pool move out of the old mode.
+         */
+        if (old_mode == PM_FAIL)
+                new_mode = old_mode;
        switch (new_mode) {
        case PM_FAIL:
                if (old_mode != new_mode)
-                        DMERR("%s: switching pool to failure mode",
+                        notify_of_pool_mode_change(pool, "failure");
-                              dm_device_name(pool->pool_md));
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_fail;
                pool->process_discard = process_bio_fail;
                pool->process_prepared_mapping = process_prepared_mapping_fail;
                pool->process_prepared_discard = process_prepared_discard_fail;
+                error_retry_list(pool);
                break;
        case PM_READ_ONLY:
                if (old_mode != new_mode)
-                        DMERR("%s: switching pool to read-only mode",
+                        notify_of_pool_mode_change(pool, "read-only");
-                              dm_device_name(pool->pool_md));
+                dm_pool_metadata_read_only(pool->pmd);
-                r = dm_pool_abort_metadata(pool->pmd);
+                pool->process_bio = process_bio_read_only;
-                if (r) {
+                pool->process_discard = process_bio_success;
-                        DMERR("%s: aborting transaction failed",
+                pool->process_prepared_mapping = process_prepared_mapping_fail;
-                              dm_device_name(pool->pool_md));
+                pool->process_prepared_discard = process_prepared_discard_passdown;
-                        new_mode = PM_FAIL;
-                        set_pool_mode(pool, new_mode);
+                error_retry_list(pool);
-                } else {
+                break;
-                        dm_pool_metadata_read_only(pool->pmd);
-                        pool->process_bio = process_bio_read_only;
+        case PM_OUT_OF_DATA_SPACE:
-                        pool->process_discard = process_discard;
+                /*
-                        pool->process_prepared_mapping = process_prepared_mapping_fail;
+                 * Ideally we'd never hit this state; the low water mark
-                        pool->process_prepared_discard = process_prepared_discard_passdown;
+                 * would trigger userland to extend the pool before we
-                }
+                 * completely run out of data space.  However, many small
+                 * IOs to unprovisioned space can consume data space at an
+                 * alarming rate.  Adjust your low water mark if you're
+                 * frequently seeing this mode.
+                 */
+                if (old_mode != new_mode)
+                        notify_of_pool_mode_change(pool, "out-of-data-space");
+                pool->process_bio = process_bio_read_only;
+                pool->process_discard = process_discard;
+                pool->process_prepared_mapping = process_prepared_mapping;
+                pool->process_prepared_discard = process_prepared_discard_passdown;
                break;
        case PM_WRITE:
                if (old_mode != new_mode)
-                        DMINFO("%s: switching pool to write mode",
+                        notify_of_pool_mode_change(pool, "write");
-                               dm_device_name(pool->pool_md));
                dm_pool_metadata_read_write(pool->pmd);
                pool->process_bio = process_bio;
                pool->process_discard = process_discard;
@@ -1447,32 +1588,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
        }
        pool->pf.mode = new_mode;
+        /*
+         * The pool mode may have changed; sync it so bind_control_target()
+         * doesn't cause an unexpected mode transition on resume.
+         */
+        pt->adjusted_pf.mode = new_mode;
 }
-/*
+static void abort_transaction(struct pool *pool)
- * Rather than calling set_pool_mode directly, use these which describe the
- * reason for mode degradation.
- */
-static void out_of_data_space(struct pool *pool)
 {
-        DMERR_LIMIT("%s: no free data space available.",
+        const char *dev_name = dm_device_name(pool->pool_md);
-                    dm_device_name(pool->pool_md));
-        set_pool_mode(pool, PM_READ_ONLY);
+        DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+        if (dm_pool_abort_metadata(pool->pmd)) {
+                DMERR("%s: failed to abort metadata transaction", dev_name);
+                set_pool_mode(pool, PM_FAIL);
+        }
+        if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+                DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+                set_pool_mode(pool, PM_FAIL);
+        }
 }
 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
 {
-        dm_block_t free_blocks;
        DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
                    dm_device_name(pool->pool_md), op, r);
-        if (r == -ENOSPC &&
+        abort_transaction(pool);
-            !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
-            !free_blocks)
-                DMERR_LIMIT("%s: no free metadata space available.",
-                            dm_device_name(pool->pool_md));
        set_pool_mode(pool, PM_READ_ONLY);
 }
@@ -1523,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
        thin_hook_bio(tc, bio);
+        if (tc->requeue_mode) {
+                bio_endio(bio, DM_ENDIO_REQUEUE);
+                return DM_MAPIO_SUBMITTED;
+        }
        if (get_pool_mode(tc->pool) == PM_FAIL) {
                bio_io_error(bio);
                return DM_MAPIO_SUBMITTED;
@@ -1686,7 +1835,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
        /*
         * We want to make sure that a pool in PM_FAIL mode is never upgraded.
         */
-        enum pool_mode old_mode = pool->pf.mode;
+        enum pool_mode old_mode = get_pool_mode(pool);
        enum pool_mode new_mode = pt->adjusted_pf.mode;
        /*
@@ -1700,16 +1849,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
        pool->pf = pt->adjusted_pf;
        pool->low_water_blocks = pt->low_water_blocks;
-        /*
-         * If we were in PM_FAIL mode, rollback of metadata failed.  We're
-         * not going to recover without a thin_repair.  So we never let the
-         * pool move out of the old mode.  On the other hand a PM_READ_ONLY
-         * may have been due to a lack of metadata or data space, and may
-         * now work (ie. if the underlying devices have been resized).
-         */
-        if (old_mode == PM_FAIL)
-                new_mode = old_mode;
        set_pool_mode(pool, new_mode);
        return 0;
@@ -1999,16 +2138,27 @@ static void metadata_low_callback(void *context)
        dm_table_event(pool->ti->table);
 }
-static sector_t get_metadata_dev_size(struct block_device *bdev)
+static sector_t get_dev_size(struct block_device *bdev)
+{
+        return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+}
+static void warn_if_metadata_device_too_big(struct block_device *bdev)
 {
-        sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+        sector_t metadata_dev_size = get_dev_size(bdev);
        char buffer[BDEVNAME_SIZE];
-        if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) {
+        if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
                DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
                       bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
-                metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING;
+}
-        }
+static sector_t get_metadata_dev_size(struct block_device *bdev)
+{
+        sector_t metadata_dev_size = get_dev_size(bdev);
+        if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
+                metadata_dev_size = THIN_METADATA_MAX_SECTORS;
        return metadata_dev_size;
 }
@@ -2017,7 +2167,7 @@ static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
 {
        sector_t metadata_dev_size = get_metadata_dev_size(bdev);
-        sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+        sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
        return metadata_dev_size;
 }
@@ -2095,12 +2245,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
                ti->error = "Error opening metadata block device";
                goto out_unlock;
        }
+        warn_if_metadata_device_too_big(metadata_dev->bdev);
-        /*
-         * Run for the side-effect of possibly issuing a warning if the
-         * device is too big.
-         */
-        (void) get_metadata_dev_size(metadata_dev->bdev);
        r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
        if (r) {
@@ -2246,6 +2391,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
                return -EINVAL;
        } else if (data_size > sb_data_size) {
+                if (dm_pool_metadata_needs_check(pool->pmd)) {
+                        DMERR("%s: unable to grow the data device until repaired.",
+                              dm_device_name(pool->pool_md));
+                        return 0;
+                }
                if (sb_data_size)
                        DMINFO("%s: growing the data device from %llu to %llu blocks",
                               dm_device_name(pool->pool_md),
@@ -2287,6 +2438,13 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
                return -EINVAL;
        } else if (metadata_dev_size > sb_metadata_dev_size) {
+                if (dm_pool_metadata_needs_check(pool->pmd)) {
+                        DMERR("%s: unable to grow the metadata device until repaired.",
+                              dm_device_name(pool->pool_md));
+                        return 0;
+                }
+                warn_if_metadata_device_too_big(pool->md_dev);
                DMINFO("%s: growing the metadata device from %llu to %llu blocks",
                       dm_device_name(pool->pool_md),
                       sb_metadata_dev_size, metadata_dev_size);
@@ -2673,7 +2831,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
                else
                        DMEMIT("- ");
-                if (pool->pf.mode == PM_READ_ONLY)
+                if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+                        DMEMIT("out_of_data_space ");
+                else if (pool->pf.mode == PM_READ_ONLY)
                        DMEMIT("ro ");
                else
                        DMEMIT("rw ");
@@ -2787,7 +2947,7 @@ static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
-        .version = {1, 10, 0},
+        .version = {1, 11, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@ -2894,6 +3054,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
        if (get_pool_mode(tc->pool) == PM_FAIL) {
                ti->error = "Couldn't open thin device, Pool is in fail mode";
+                r = -EINVAL;
                goto bad_thin_open;
        }
@@ -2905,7 +3066,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
        r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
        if (r)
-                goto bad_thin_open;
+                goto bad_target_max_io_len;
        ti->num_flush_bios = 1;
        ti->flush_supported = true;
@@ -2926,6 +3087,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
        return 0;
+bad_target_max_io_len:
+        dm_pool_close_thin_device(tc->td);
 bad_thin_open:
        __pool_dec(tc->pool);
 bad_pool_lookup:
@@ -2986,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
        return 0;
 }
-static void thin_postsuspend(struct dm_target *ti)
+static void thin_presuspend(struct dm_target *ti)
 {
+        struct thin_c *tc = ti->private;
        if (dm_noflush_suspending(ti))
-                requeue_io((struct thin_c *)ti->private);
+                noflush_work(tc, do_noflush_start);
+}
+static void thin_postsuspend(struct dm_target *ti)
+{
+        struct thin_c *tc = ti->private;
+        /*
+         * The dm_noflush_suspending flag has been cleared by now, so
+         * unfortunately we must always run this.
+         */
+        noflush_work(tc, do_noflush_stop);
 }
 /*
@@ -3074,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti,
 static struct target_type thin_target = {
        .name = "thin",
-        .version = {1, 10, 0},
+        .version = {1, 11, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,
        .map = thin_map,
        .end_io = thin_endio,
+        .presuspend = thin_presuspend,
        .postsuspend = thin_postsuspend,
        .status = thin_status,
        .iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA
       ---help---
         Library providing immutable on-disk data structure support for
         device-mapper targets such as the thin provisioning target.
+config DM_DEBUG_BLOCK_STACK_TRACING
+       boolean "Keep stack trace of persistent data block lock holders"
+       depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
+       select STACKTRACE
+       ---help---
+         Enable this for messages that may help debug problems with the
+         block manager locking used by thin provisioning and caching.
+         If unsure, say N.
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 536782e3bcb7..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -91,6 +91,69 @@ struct block_op {
        dm_block_t block;
 };
+struct bop_ring_buffer {
+        unsigned begin;
+        unsigned end;
+        struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
+};
+static void brb_init(struct bop_ring_buffer *brb)
+{
+        brb->begin = 0;
+        brb->end = 0;
+}
+static bool brb_empty(struct bop_ring_buffer *brb)
+{
+        return brb->begin == brb->end;
+}
+static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
+{
+        unsigned r = old + 1;
+        return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+}
+static int brb_push(struct bop_ring_buffer *brb,
+                    enum block_op_type type, dm_block_t b)
+{
+        struct block_op *bop;
+        unsigned next = brb_next(brb, brb->end);
+        /*
+         * We don't allow the last bop to be filled; this way we can
+         * differentiate between full and empty.
+         */
+        if (next == brb->begin)
+                return -ENOMEM;
+        bop = brb->bops + brb->end;
+        bop->type = type;
+        bop->block = b;
+        brb->end = next;
+        return 0;
+}
+static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+{
+        struct block_op *bop;
+        if (brb_empty(brb))
+                return -ENODATA;
+        bop = brb->bops + brb->begin;
+        result->type = bop->type;
+        result->block = bop->block;
+        brb->begin = brb_next(brb, brb->begin);
+        return 0;
+}
+/*----------------------------------------------------------------*/
 struct sm_metadata {
        struct dm_space_map sm;
@@ -101,25 +164,20 @@ struct sm_metadata {
        unsigned recursion_count;
        unsigned allocated_this_transaction;
-        unsigned nr_uncommitted;
+        struct bop_ring_buffer uncommitted;
-        struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
        struct threshold threshold;
 };
 static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
 {
-        struct block_op *op;
+        int r = brb_push(&smm->uncommitted, type, b);
-        if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
+        if (r) {
                DMERR("too many recursive allocations");
                return -ENOMEM;
        }
-        op = smm->uncommitted + smm->nr_uncommitted++;
-        op->type = type;
-        op->block = b;
        return 0;
 }
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
                return -ENOMEM;
        }
-        if (smm->recursion_count == 1 && smm->nr_uncommitted) {
+        if (smm->recursion_count == 1) {
-                while (smm->nr_uncommitted && !r) {
+                while (!brb_empty(&smm->uncommitted)) {
-                        smm->nr_uncommitted--;
+                        struct block_op bop;
-                        r = commit_bop(smm, smm->uncommitted +
-                                       smm->nr_uncommitted);
+                        r = brb_pop(&smm->uncommitted, &bop);
+                        if (r) {
+                                DMERR("bug in bop ring buffer");
+                                break;
+                        }
+                        r = commit_bop(smm, &bop);
                        if (r)
                                break;
                }
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
 static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
                                 uint32_t *result)
 {
-        int r, i;
+        int r;
+        unsigned i;
        struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
        unsigned adjustment = 0;
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
         * We may have some uncommitted adjustments to add.  This list
         * should always be really short.
         */
-        for (i = 0; i < smm->nr_uncommitted; i++) {
+        for (i = smm->uncommitted.begin;
-                struct block_op *op = smm->uncommitted + i;
+             i != smm->uncommitted.end;
+             i = brb_next(&smm->uncommitted, i)) {
+                struct block_op *op = smm->uncommitted.bops + i;
                if (op->block != b)
                        continue;
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
                                              dm_block_t b, int *result)
 {
-        int r, i, adjustment = 0;
+        int r, adjustment = 0;
+        unsigned i;
        struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
        uint32_t rc;
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
         * We may have some uncommitted adjustments to add.  This list
         * should always be really short.
         */
-        for (i = 0; i < smm->nr_uncommitted; i++) {
+        for (i = smm->uncommitted.begin;
-                struct block_op *op = smm->uncommitted + i;
+             i != smm->uncommitted.end;
+             i = brb_next(&smm->uncommitted, i)) {
+                struct block_op *op = smm->uncommitted.bops + i;
                if (op->block != b)
                        continue;
@@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
        smm->begin = superblock + 1;
        smm->recursion_count = 0;
        smm->allocated_this_transaction = 0;
-        smm->nr_uncommitted = 0;
+        brb_init(&smm->uncommitted);
        threshold_init(&smm->threshold);
        memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -680,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
        if (r)
                return r;
+        if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
+                nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
        r = sm_ll_extend(&smm->ll, nr_blocks);
        if (r)
                return r;
@@ -713,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
        smm->begin = 0;
        smm->recursion_count = 0;
        smm->allocated_this_transaction = 0;
-        smm->nr_uncommitted = 0;
+        brb_init(&smm->uncommitted);
        threshold_init(&smm->threshold);
        memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
index 39bba0801cf2..64df923974d8 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.h
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -9,6 +9,17 @@
 #include "dm-transaction-manager.h"
+#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)
+/*
+ * The metadata device is currently limited in size.
+ *
+ * We have a single index block, which can hold 255 index entries.  Each
+ * index entry contains allocation info about ~16k metadata blocks.
+ */
+#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))
+#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)
 /*
 * Unfortunately we have to use two-phase construction due to the cycle
 * between the tm and sm.