path: root/drivers/md
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 15
-rw-r--r--  drivers/md/Makefile | 1
-rw-r--r--  drivers/md/bcache/Makefile | 5
-rw-r--r--  drivers/md/bcache/alloc.c | 89
-rw-r--r--  drivers/md/bcache/bcache.h | 88
-rw-r--r--  drivers/md/bcache/bset.c | 907
-rw-r--r--  drivers/md/bcache/bset.h | 440
-rw-r--r--  drivers/md/bcache/btree.c | 684
-rw-r--r--  drivers/md/bcache/btree.h | 62
-rw-r--r--  drivers/md/bcache/closure.c | 90
-rw-r--r--  drivers/md/bcache/closure.h | 355
-rw-r--r--  drivers/md/bcache/debug.c | 268
-rw-r--r--  drivers/md/bcache/debug.h | 27
-rw-r--r--  drivers/md/bcache/extents.c | 616
-rw-r--r--  drivers/md/bcache/extents.h | 13
-rw-r--r--  drivers/md/bcache/io.c | 196
-rw-r--r--  drivers/md/bcache/journal.c | 87
-rw-r--r--  drivers/md/bcache/journal.h | 1
-rw-r--r--  drivers/md/bcache/movinggc.c | 6
-rw-r--r--  drivers/md/bcache/request.c | 204
-rw-r--r--  drivers/md/bcache/request.h | 21
-rw-r--r--  drivers/md/bcache/super.c | 123
-rw-r--r--  drivers/md/bcache/sysfs.c | 79
-rw-r--r--  drivers/md/bcache/util.c | 4
-rw-r--r--  drivers/md/bcache/util.h | 8
-rw-r--r--  drivers/md/bcache/writeback.c | 6
-rw-r--r--  drivers/md/bcache/writeback.h | 2
-rw-r--r--  drivers/md/bitmap.c | 2
-rw-r--r--  drivers/md/bitmap.h | 2
-rw-r--r--  drivers/md/dm-bio-record.h | 37
-rw-r--r--  drivers/md/dm-bufio.c | 38
-rw-r--r--  drivers/md/dm-bufio.h | 12
-rw-r--r--  drivers/md/dm-builtin.c | 48
-rw-r--r--  drivers/md/dm-cache-policy-mq.c | 78
-rw-r--r--  drivers/md/dm-cache-policy.c | 4
-rw-r--r--  drivers/md/dm-cache-policy.h | 6
-rw-r--r--  drivers/md/dm-cache-target.c | 68
-rw-r--r--  drivers/md/dm-crypt.c | 64
-rw-r--r--  drivers/md/dm-delay.c | 42
-rw-r--r--  drivers/md/dm-flakey.c | 7
-rw-r--r--  drivers/md/dm-io.c | 34
-rw-r--r--  drivers/md/dm-linear.c | 3
-rw-r--r--  drivers/md/dm-log-userspace-base.c | 206
-rw-r--r--  drivers/md/dm-mpath.c | 7
-rw-r--r--  drivers/md/dm-raid1.c | 23
-rw-r--r--  drivers/md/dm-region-hash.c | 3
-rw-r--r--  drivers/md/dm-snap-persistent.c | 90
-rw-r--r--  drivers/md/dm-snap.c | 29
-rw-r--r--  drivers/md/dm-stripe.c | 13
-rw-r--r--  drivers/md/dm-switch.c | 4
-rw-r--r--  drivers/md/dm-sysfs.c | 5
-rw-r--r--  drivers/md/dm-table.c | 22
-rw-r--r--  drivers/md/dm-thin-metadata.c | 78
-rw-r--r--  drivers/md/dm-thin-metadata.h | 25
-rw-r--r--  drivers/md/dm-thin.c | 567
-rw-r--r--  drivers/md/dm-verity.c | 62
-rw-r--r--  drivers/md/dm.c | 204
-rw-r--r--  drivers/md/dm.h | 17
-rw-r--r--  drivers/md/faulty.c | 19
-rw-r--r--  drivers/md/linear.c | 96
-rw-r--r--  drivers/md/md.c | 106
-rw-r--r--  drivers/md/md.h | 13
-rw-r--r--  drivers/md/multipath.c | 13
-rw-r--r--  drivers/md/persistent-data/Kconfig | 10
-rw-r--r--  drivers/md/persistent-data/dm-block-manager.c | 2
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 33
-rw-r--r--  drivers/md/persistent-data/dm-btree.h | 8
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c | 6
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c | 147
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.h | 11
-rw-r--r--  drivers/md/raid0.c | 79
-rw-r--r--  drivers/md/raid1.c | 89
-rw-r--r--  drivers/md/raid10.c | 209
-rw-r--r--  drivers/md/raid5.c | 191
74 files changed, 4061 insertions, 3168 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index f2ccbc3b9fe4..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -176,8 +176,12 @@ config MD_FAULTY
176 176
177source "drivers/md/bcache/Kconfig" 177source "drivers/md/bcache/Kconfig"
178 178
179config BLK_DEV_DM_BUILTIN
180 boolean
181
179config BLK_DEV_DM 182config BLK_DEV_DM
180 tristate "Device mapper support" 183 tristate "Device mapper support"
184 select BLK_DEV_DM_BUILTIN
181 ---help--- 185 ---help---
182 Device-mapper is a low level volume manager. It works by allowing 186 Device-mapper is a low level volume manager. It works by allowing
183 people to specify mappings for ranges of logical sectors. Various 187 people to specify mappings for ranges of logical sectors. Various
@@ -238,6 +242,7 @@ config DM_CRYPT
238config DM_SNAPSHOT 242config DM_SNAPSHOT
239 tristate "Snapshot target" 243 tristate "Snapshot target"
240 depends on BLK_DEV_DM 244 depends on BLK_DEV_DM
245 select DM_BUFIO
241 ---help--- 246 ---help---
242 Allow volume managers to take writable snapshots of a device. 247 Allow volume managers to take writable snapshots of a device.
243 248
@@ -249,16 +254,6 @@ config DM_THIN_PROVISIONING
249 ---help--- 254 ---help---
250 Provides thin provisioning and snapshots that share a data store. 255 Provides thin provisioning and snapshots that share a data store.
251 256
252config DM_DEBUG_BLOCK_STACK_TRACING
253 boolean "Keep stack trace of thin provisioning block lock holders"
254 depends on STACKTRACE_SUPPORT && DM_THIN_PROVISIONING
255 select STACKTRACE
256 ---help---
257 Enable this for messages that may help debug problems with the
258 block manager locking used by thin provisioning.
259
260 If unsure, say N.
261
262config DM_CACHE 257config DM_CACHE
263 tristate "Cache target (EXPERIMENTAL)" 258 tristate "Cache target (EXPERIMENTAL)"
264 depends on BLK_DEV_DM 259 depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 2acc43fe0229..f26d83292579 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o
32obj-$(CONFIG_BCACHE) += bcache/ 32obj-$(CONFIG_BCACHE) += bcache/
33obj-$(CONFIG_BLK_DEV_MD) += md-mod.o 33obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
34obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o 34obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
35obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o
35obj-$(CONFIG_DM_BUFIO) += dm-bufio.o 36obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
36obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o 37obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o
37obj-$(CONFIG_DM_CRYPT) += dm-crypt.o 38obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 0e9c82523be6..c488b846f831 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -1,7 +1,8 @@
1 1
2obj-$(CONFIG_BCACHE) += bcache.o 2obj-$(CONFIG_BCACHE) += bcache.o
3 3
4bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ 4bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
5 movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o 5 io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
6 util.o writeback.o
6 7
7CFLAGS_request.o += -Iblock 8CFLAGS_request.o += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4c9852d92b0a..c0d37d082443 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
132{ 132{
133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); 133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
134 134
135 if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && 135 if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
136 CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) 136 unsigned i;
137 return false; 137
138 for (i = 0; i < RESERVE_NONE; i++)
139 if (!fifo_full(&ca->free[i]))
140 goto add;
138 141
142 return false;
143 }
144add:
139 b->prio = 0; 145 b->prio = 0;
140 146
141 if (can_inc_bucket_gen(b) && 147 if (can_inc_bucket_gen(b) &&
@@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
162 fifo_push(&ca->free_inc, b - ca->buckets); 168 fifo_push(&ca->free_inc, b - ca->buckets);
163} 169}
164 170
165#define bucket_prio(b) \ 171/*
166 (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) 172 * Determines what order we're going to reuse buckets, smallest bucket_prio()
173 * first: we also take into account the number of sectors of live data in that
 174 * bucket, and for that multiplication to make sense the priorities have to be scaled.
175 *
176 * Thus, we scale the bucket priorities so that the bucket with the smallest
177 * prio is worth 1/8th of what INITIAL_PRIO is worth.
178 */
179
180#define bucket_prio(b) \
181({ \
182 unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
183 \
184 (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
185})
167 186
168#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) 187#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
169#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) 188#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
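The scaling described in the comment above can be sanity-checked with a small user-space model; everything below (the struct, the min_prio of 1000, the sector counts) is illustrative and only mirrors the shape of the new macro, it is not kernel code:

#include <stdio.h>
#include <stdint.h>

#define INITIAL_PRIO 32768U

/* Stand-ins for the kernel's struct bucket / cache_set fields. */
struct bucket {
        uint16_t prio;
        unsigned sectors_used;  /* what GC_SECTORS_USED(b) would return */
};

static unsigned bucket_prio(const struct bucket *b, unsigned set_min_prio)
{
        /* Same scaling as the macro above: the coldest bucket is still
         * worth 1/8th of INITIAL_PRIO, so its live-sector count still
         * matters instead of being multiplied by zero. */
        unsigned min_prio = (INITIAL_PRIO - set_min_prio) / 8;

        return (b->prio - set_min_prio + min_prio) * b->sectors_used;
}

int main(void)
{
        unsigned set_min_prio = 1000;   /* illustrative */
        struct bucket cold = { .prio = 1000, .sectors_used = 128 };
        struct bucket hot  = { .prio = 9000, .sectors_used = 128 };

        printf("cold %u, hot %u\n",
               bucket_prio(&cold, set_min_prio),
               bucket_prio(&hot, set_min_prio));
        return 0;
}

With the old definition a bucket sitting exactly at min_prio multiplied out to zero no matter how much live data it held; the added min_prio offset keeps GC_SECTORS_USED() relevant even for the coldest buckets, which is what the 1/8th-of-INITIAL_PRIO scaling is about.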
@@ -304,6 +323,21 @@ do { \
304 __set_current_state(TASK_RUNNING); \ 323 __set_current_state(TASK_RUNNING); \
305} while (0) 324} while (0)
306 325
326static int bch_allocator_push(struct cache *ca, long bucket)
327{
328 unsigned i;
329
330 /* Prios/gens are actually the most important reserve */
331 if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
332 return true;
333
334 for (i = 0; i < RESERVE_NR; i++)
335 if (fifo_push(&ca->free[i], bucket))
336 return true;
337
338 return false;
339}
340
307static int bch_allocator_thread(void *arg) 341static int bch_allocator_thread(void *arg)
308{ 342{
309 struct cache *ca = arg; 343 struct cache *ca = arg;
@@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg)
336 mutex_lock(&ca->set->bucket_lock); 370 mutex_lock(&ca->set->bucket_lock);
337 } 371 }
338 372
339 allocator_wait(ca, !fifo_full(&ca->free)); 373 allocator_wait(ca, bch_allocator_push(ca, bucket));
340
341 fifo_push(&ca->free, bucket);
342 wake_up(&ca->set->bucket_wait); 374 wake_up(&ca->set->bucket_wait);
343 } 375 }
344 376
@@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg)
365 } 397 }
366} 398}
367 399
368long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) 400long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
369{ 401{
370 DEFINE_WAIT(w); 402 DEFINE_WAIT(w);
371 struct bucket *b; 403 struct bucket *b;
372 long r; 404 long r;
373 405
374 /* fastpath */ 406 /* fastpath */
375 if (fifo_used(&ca->free) > ca->watermark[watermark]) { 407 if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
376 fifo_pop(&ca->free, r); 408 fifo_pop(&ca->free[reserve], r))
377 goto out; 409 goto out;
378 }
379 410
380 if (!wait) 411 if (!wait)
381 return -1; 412 return -1;
382 413
383 while (1) { 414 do {
384 if (fifo_used(&ca->free) > ca->watermark[watermark]) {
385 fifo_pop(&ca->free, r);
386 break;
387 }
388
389 prepare_to_wait(&ca->set->bucket_wait, &w, 415 prepare_to_wait(&ca->set->bucket_wait, &w,
390 TASK_UNINTERRUPTIBLE); 416 TASK_UNINTERRUPTIBLE);
391 417
392 mutex_unlock(&ca->set->bucket_lock); 418 mutex_unlock(&ca->set->bucket_lock);
393 schedule(); 419 schedule();
394 mutex_lock(&ca->set->bucket_lock); 420 mutex_lock(&ca->set->bucket_lock);
395 } 421 } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
422 !fifo_pop(&ca->free[reserve], r));
396 423
397 finish_wait(&ca->set->bucket_wait, &w); 424 finish_wait(&ca->set->bucket_wait, &w);
398out: 425out:
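A minimal user-space model of the new reserve scheme (fixed-size ring buffers stand in for the kernel fifo macros; the RESERVE_* names follow the enum added to bcache.h, everything else is made up for illustration):

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

enum alloc_reserve {
        RESERVE_BTREE,
        RESERVE_PRIO,
        RESERVE_MOVINGGC,
        RESERVE_NONE,
        RESERVE_NR,
};

#define FIFO_SIZE 8     /* power of two, so the wrapping below works */

struct fifo {
        long buf[FIFO_SIZE];
        unsigned head, tail;
};

static bool fifo_push(struct fifo *f, long v)
{
        if (f->head - f->tail == FIFO_SIZE)
                return false;
        f->buf[f->head++ % FIFO_SIZE] = v;
        return true;
}

static bool fifo_pop(struct fifo *f, long *v)
{
        if (f->head == f->tail)
                return false;
        *v = f->buf[f->tail++ % FIFO_SIZE];
        return true;
}

/* Mirrors bch_allocator_push(): the prio/gen reserve is filled first,
 * then whichever other reserve still has room. */
static bool allocator_push(struct fifo free[], long bucket)
{
        unsigned i;

        if (fifo_push(&free[RESERVE_PRIO], bucket))
                return true;

        for (i = 0; i < RESERVE_NR; i++)
                if (fifo_push(&free[i], bucket))
                        return true;

        return false;
}

/* Mirrors the bch_bucket_alloc() fastpath: drain the unreserved pool
 * before dipping into the caller's own reserve. */
static bool bucket_alloc(struct fifo free[], unsigned reserve, long *r)
{
        return fifo_pop(&free[RESERVE_NONE], r) ||
               fifo_pop(&free[reserve], r);
}

int main(void)
{
        struct fifo free[RESERVE_NR];
        long b;
        unsigned i;

        memset(free, 0, sizeof(free));

        for (i = 0; i < 40; i++)
                allocator_push(free, i);

        while (bucket_alloc(free, RESERVE_MOVINGGC, &b))
                printf("got bucket %ld\n", b);
        return 0;
}

Pushing 40 buckets fills every reserve; the RESERVE_MOVINGGC caller then drains RESERVE_NONE (buckets 24-31) before touching its own reserve (16-23), matching the ordering of the fastpath above.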
@@ -401,12 +428,14 @@ out:
401 if (expensive_debug_checks(ca->set)) { 428 if (expensive_debug_checks(ca->set)) {
402 size_t iter; 429 size_t iter;
403 long i; 430 long i;
431 unsigned j;
404 432
405 for (iter = 0; iter < prio_buckets(ca) * 2; iter++) 433 for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
406 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); 434 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
407 435
408 fifo_for_each(i, &ca->free, iter) 436 for (j = 0; j < RESERVE_NR; j++)
409 BUG_ON(i == r); 437 fifo_for_each(i, &ca->free[j], iter)
438 BUG_ON(i == r);
410 fifo_for_each(i, &ca->free_inc, iter) 439 fifo_for_each(i, &ca->free_inc, iter)
411 BUG_ON(i == r); 440 BUG_ON(i == r);
412 fifo_for_each(i, &ca->unused, iter) 441 fifo_for_each(i, &ca->unused, iter)
@@ -419,7 +448,7 @@ out:
419 448
420 SET_GC_SECTORS_USED(b, ca->sb.bucket_size); 449 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
421 450
422 if (watermark <= WATERMARK_METADATA) { 451 if (reserve <= RESERVE_PRIO) {
423 SET_GC_MARK(b, GC_MARK_METADATA); 452 SET_GC_MARK(b, GC_MARK_METADATA);
424 SET_GC_MOVE(b, 0); 453 SET_GC_MOVE(b, 0);
425 b->prio = BTREE_PRIO; 454 b->prio = BTREE_PRIO;
@@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
445 } 474 }
446} 475}
447 476
448int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 477int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
449 struct bkey *k, int n, bool wait) 478 struct bkey *k, int n, bool wait)
450{ 479{
451 int i; 480 int i;
@@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
459 488
460 for (i = 0; i < n; i++) { 489 for (i = 0; i < n; i++) {
461 struct cache *ca = c->cache_by_alloc[i]; 490 struct cache *ca = c->cache_by_alloc[i];
462 long b = bch_bucket_alloc(ca, watermark, wait); 491 long b = bch_bucket_alloc(ca, reserve, wait);
463 492
464 if (b == -1) 493 if (b == -1)
465 goto err; 494 goto err;
@@ -478,12 +507,12 @@ err:
478 return -1; 507 return -1;
479} 508}
480 509
481int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 510int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
482 struct bkey *k, int n, bool wait) 511 struct bkey *k, int n, bool wait)
483{ 512{
484 int ret; 513 int ret;
485 mutex_lock(&c->bucket_lock); 514 mutex_lock(&c->bucket_lock);
486 ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); 515 ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
487 mutex_unlock(&c->bucket_lock); 516 mutex_unlock(&c->bucket_lock);
488 return ret; 517 return ret;
489} 518}
@@ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
573 602
574 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { 603 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
575 unsigned watermark = write_prio 604 unsigned watermark = write_prio
576 ? WATERMARK_MOVINGGC 605 ? RESERVE_MOVINGGC
577 : WATERMARK_NONE; 606 : RESERVE_NONE;
578 607
579 spin_unlock(&c->data_bucket_lock); 608 spin_unlock(&c->data_bucket_lock);
580 609
@@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca)
689 * Then 8 for btree allocations 718 * Then 8 for btree allocations
690 * Then half for the moving garbage collector 719 * Then half for the moving garbage collector
691 */ 720 */
692 721#if 0
693 ca->watermark[WATERMARK_PRIO] = 0; 722 ca->watermark[WATERMARK_PRIO] = 0;
694 723
695 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); 724 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
@@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca)
699 728
700 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + 729 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
701 ca->watermark[WATERMARK_MOVINGGC]; 730 ca->watermark[WATERMARK_MOVINGGC];
702 731#endif
703 return 0; 732 return 0;
704} 733}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 754f43177483..a4c7306ff43d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -187,6 +187,7 @@
187#include <linux/types.h> 187#include <linux/types.h>
188#include <linux/workqueue.h> 188#include <linux/workqueue.h>
189 189
190#include "bset.h"
190#include "util.h" 191#include "util.h"
191#include "closure.h" 192#include "closure.h"
192 193
@@ -209,7 +210,9 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
209#define GC_MARK_RECLAIMABLE 0 210#define GC_MARK_RECLAIMABLE 0
210#define GC_MARK_DIRTY 1 211#define GC_MARK_DIRTY 1
211#define GC_MARK_METADATA 2 212#define GC_MARK_METADATA 2
212BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13); 213#define GC_SECTORS_USED_SIZE 13
214#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE))
215BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
213BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); 216BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
214 217
215#include "journal.h" 218#include "journal.h"
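MAX_GC_SECTORS_USED is just the all-ones value of the 13-bit slice of gc_mark; the bit arithmetic is easy to check standalone (the clamping at the end is a made-up caller, not a quote from this patch):

#include <stdio.h>
#include <assert.h>

#define GC_SECTORS_USED_SIZE    13
#define MAX_GC_SECTORS_USED     (~(~0ULL << GC_SECTORS_USED_SIZE))

int main(void)
{
        /* ~0ULL << 13 zeroes the low 13 bits; inverting keeps only them. */
        assert(MAX_GC_SECTORS_USED == 8191);    /* (1 << 13) - 1 */

        /* A caller storing a sector count would clamp to the field width. */
        unsigned sectors = 20000;
        unsigned stored = sectors > MAX_GC_SECTORS_USED
                ? (unsigned) MAX_GC_SECTORS_USED
                : sectors;

        printf("stored %u of %u sectors\n", stored, sectors);
        return 0;
}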
@@ -280,7 +283,6 @@ struct bcache_device {
280 unsigned long sectors_dirty_last; 283 unsigned long sectors_dirty_last;
281 long sectors_dirty_derivative; 284 long sectors_dirty_derivative;
282 285
283 mempool_t *unaligned_bvec;
284 struct bio_set *bio_split; 286 struct bio_set *bio_split;
285 287
286 unsigned data_csum:1; 288 unsigned data_csum:1;
@@ -310,7 +312,8 @@ struct cached_dev {
310 struct cache_sb sb; 312 struct cache_sb sb;
311 struct bio sb_bio; 313 struct bio sb_bio;
312 struct bio_vec sb_bv[1]; 314 struct bio_vec sb_bv[1];
313 struct closure_with_waitlist sb_write; 315 struct closure sb_write;
316 struct semaphore sb_write_mutex;
314 317
315 /* Refcount on the cache set. Always nonzero when we're caching. */ 318 /* Refcount on the cache set. Always nonzero when we're caching. */
316 atomic_t count; 319 atomic_t count;
@@ -383,12 +386,12 @@ struct cached_dev {
383 unsigned writeback_rate_p_term_inverse; 386 unsigned writeback_rate_p_term_inverse;
384}; 387};
385 388
386enum alloc_watermarks { 389enum alloc_reserve {
387 WATERMARK_PRIO, 390 RESERVE_BTREE,
388 WATERMARK_METADATA, 391 RESERVE_PRIO,
389 WATERMARK_MOVINGGC, 392 RESERVE_MOVINGGC,
390 WATERMARK_NONE, 393 RESERVE_NONE,
391 WATERMARK_MAX 394 RESERVE_NR,
392}; 395};
393 396
394struct cache { 397struct cache {
@@ -400,8 +403,6 @@ struct cache {
400 struct kobject kobj; 403 struct kobject kobj;
401 struct block_device *bdev; 404 struct block_device *bdev;
402 405
403 unsigned watermark[WATERMARK_MAX];
404
405 struct task_struct *alloc_thread; 406 struct task_struct *alloc_thread;
406 407
407 struct closure prio; 408 struct closure prio;
@@ -430,7 +431,7 @@ struct cache {
430 * because all the data they contained was overwritten), so we only 431 * because all the data they contained was overwritten), so we only
431 * need to discard them before they can be moved to the free list. 432 * need to discard them before they can be moved to the free list.
432 */ 433 */
433 DECLARE_FIFO(long, free); 434 DECLARE_FIFO(long, free)[RESERVE_NR];
434 DECLARE_FIFO(long, free_inc); 435 DECLARE_FIFO(long, free_inc);
435 DECLARE_FIFO(long, unused); 436 DECLARE_FIFO(long, unused);
436 437
@@ -515,7 +516,8 @@ struct cache_set {
515 uint64_t cached_dev_sectors; 516 uint64_t cached_dev_sectors;
516 struct closure caching; 517 struct closure caching;
517 518
518 struct closure_with_waitlist sb_write; 519 struct closure sb_write;
520 struct semaphore sb_write_mutex;
519 521
520 mempool_t *search; 522 mempool_t *search;
521 mempool_t *bio_meta; 523 mempool_t *bio_meta;
@@ -630,13 +632,15 @@ struct cache_set {
630 632
631#ifdef CONFIG_BCACHE_DEBUG 633#ifdef CONFIG_BCACHE_DEBUG
632 struct btree *verify_data; 634 struct btree *verify_data;
635 struct bset *verify_ondisk;
633 struct mutex verify_lock; 636 struct mutex verify_lock;
634#endif 637#endif
635 638
636 unsigned nr_uuids; 639 unsigned nr_uuids;
637 struct uuid_entry *uuids; 640 struct uuid_entry *uuids;
638 BKEY_PADDED(uuid_bucket); 641 BKEY_PADDED(uuid_bucket);
639 struct closure_with_waitlist uuid_write; 642 struct closure uuid_write;
643 struct semaphore uuid_write_mutex;
640 644
641 /* 645 /*
642 * A btree node on disk could have too many bsets for an iterator to fit 646 * A btree node on disk could have too many bsets for an iterator to fit
@@ -644,13 +648,7 @@ struct cache_set {
644 */ 648 */
645 mempool_t *fill_iter; 649 mempool_t *fill_iter;
646 650
647 /* 651 struct bset_sort_state sort;
648 * btree_sort() is a merge sort and requires temporary space - single
649 * element mempool
650 */
651 struct mutex sort_lock;
652 struct bset *sort;
653 unsigned sort_crit_factor;
654 652
655 /* List of buckets we're currently writing data to */ 653 /* List of buckets we're currently writing data to */
656 struct list_head data_buckets; 654 struct list_head data_buckets;
@@ -666,7 +664,6 @@ struct cache_set {
666 unsigned congested_read_threshold_us; 664 unsigned congested_read_threshold_us;
667 unsigned congested_write_threshold_us; 665 unsigned congested_write_threshold_us;
668 666
669 struct time_stats sort_time;
670 struct time_stats btree_gc_time; 667 struct time_stats btree_gc_time;
671 struct time_stats btree_split_time; 668 struct time_stats btree_split_time;
672 struct time_stats btree_read_time; 669 struct time_stats btree_read_time;
@@ -684,9 +681,9 @@ struct cache_set {
684 unsigned error_decay; 681 unsigned error_decay;
685 682
686 unsigned short journal_delay_ms; 683 unsigned short journal_delay_ms;
684 bool expensive_debug_checks;
687 unsigned verify:1; 685 unsigned verify:1;
688 unsigned key_merging_disabled:1; 686 unsigned key_merging_disabled:1;
689 unsigned expensive_debug_checks:1;
690 unsigned gc_always_rewrite:1; 687 unsigned gc_always_rewrite:1;
691 unsigned shrinker_disabled:1; 688 unsigned shrinker_disabled:1;
692 unsigned copy_gc_enabled:1; 689 unsigned copy_gc_enabled:1;
@@ -708,13 +705,8 @@ struct bbio {
708 struct bio bio; 705 struct bio bio;
709}; 706};
710 707
711static inline unsigned local_clock_us(void)
712{
713 return local_clock() >> 10;
714}
715
716#define BTREE_PRIO USHRT_MAX 708#define BTREE_PRIO USHRT_MAX
717#define INITIAL_PRIO 32768 709#define INITIAL_PRIO 32768U
718 710
719#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) 711#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE)
720#define btree_blocks(b) \ 712#define btree_blocks(b) \
@@ -727,21 +719,6 @@ static inline unsigned local_clock_us(void)
727#define bucket_bytes(c) ((c)->sb.bucket_size << 9) 719#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
728#define block_bytes(c) ((c)->sb.block_size << 9) 720#define block_bytes(c) ((c)->sb.block_size << 9)
729 721
730#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
731#define set_bytes(i) __set_bytes(i, i->keys)
732
733#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
734#define set_blocks(i, c) __set_blocks(i, (i)->keys, c)
735
736#define node(i, j) ((struct bkey *) ((i)->d + (j)))
737#define end(i) node(i, (i)->keys)
738
739#define index(i, b) \
740 ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \
741 block_bytes(b->c)))
742
743#define btree_data_space(b) (PAGE_SIZE << (b)->page_order)
744
745#define prios_per_bucket(c) \ 722#define prios_per_bucket(c) \
746 ((bucket_bytes(c) - sizeof(struct prio_set)) / \ 723 ((bucket_bytes(c) - sizeof(struct prio_set)) / \
747 sizeof(struct bucket_disk)) 724 sizeof(struct bucket_disk))
@@ -784,20 +761,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
784 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); 761 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
785} 762}
786 763
787/* Btree key macros */ 764static inline uint8_t gen_after(uint8_t a, uint8_t b)
765{
766 uint8_t r = a - b;
767 return r > 128U ? 0 : r;
768}
788 769
789static inline void bkey_init(struct bkey *k) 770static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
771 unsigned i)
790{ 772{
791 *k = ZERO_KEY; 773 return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
792} 774}
793 775
776static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
777 unsigned i)
778{
779 return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
780}
781
782/* Btree key macros */
783
794/* 784/*
795 * This is used for various on disk data structures - cache_sb, prio_set, bset, 785 * This is used for various on disk data structures - cache_sb, prio_set, bset,
796 * jset: The checksum is _always_ the first 8 bytes of these structs 786 * jset: The checksum is _always_ the first 8 bytes of these structs
797 */ 787 */
798#define csum_set(i) \ 788#define csum_set(i) \
799 bch_crc64(((void *) (i)) + sizeof(uint64_t), \ 789 bch_crc64(((void *) (i)) + sizeof(uint64_t), \
800 ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) 790 ((void *) bset_bkey_last(i)) - \
791 (((void *) (i)) + sizeof(uint64_t)))
801 792
802/* Error handling macros */ 793/* Error handling macros */
803 794
@@ -902,7 +893,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
902void bch_bbio_free(struct bio *, struct cache_set *); 893void bch_bbio_free(struct bio *, struct cache_set *);
903struct bio *bch_bbio_alloc(struct cache_set *); 894struct bio *bch_bbio_alloc(struct cache_set *);
904 895
905struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *);
906void bch_generic_make_request(struct bio *, struct bio_split_pool *); 896void bch_generic_make_request(struct bio *, struct bio_split_pool *);
907void __bch_submit_bbio(struct bio *, struct cache_set *); 897void __bch_submit_bbio(struct bio *, struct cache_set *);
908void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); 898void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 7d388b8bb50e..3f74b4b0747b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -5,30 +5,134 @@
5 * Copyright 2012 Google, Inc. 5 * Copyright 2012 Google, Inc.
6 */ 6 */
7 7
8#include "bcache.h" 8#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
9#include "btree.h"
10#include "debug.h"
11 9
10#include "util.h"
11#include "bset.h"
12
13#include <linux/console.h>
12#include <linux/random.h> 14#include <linux/random.h>
13#include <linux/prefetch.h> 15#include <linux/prefetch.h>
14 16
17#ifdef CONFIG_BCACHE_DEBUG
18
19void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
20{
21 struct bkey *k, *next;
22
23 for (k = i->start; k < bset_bkey_last(i); k = next) {
24 next = bkey_next(k);
25
26 printk(KERN_ERR "block %u key %li/%u: ", set,
27 (uint64_t *) k - i->d, i->keys);
28
29 if (b->ops->key_dump)
30 b->ops->key_dump(b, k);
31 else
32 printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
33
34 if (next < bset_bkey_last(i) &&
35 bkey_cmp(k, b->ops->is_extents ?
36 &START_KEY(next) : next) > 0)
37 printk(KERN_ERR "Key skipped backwards\n");
38 }
39}
40
41void bch_dump_bucket(struct btree_keys *b)
42{
43 unsigned i;
44
45 console_lock();
46 for (i = 0; i <= b->nsets; i++)
47 bch_dump_bset(b, b->set[i].data,
48 bset_sector_offset(b, b->set[i].data));
49 console_unlock();
50}
51
52int __bch_count_data(struct btree_keys *b)
53{
54 unsigned ret = 0;
55 struct btree_iter iter;
56 struct bkey *k;
57
58 if (b->ops->is_extents)
59 for_each_key(b, k, &iter)
60 ret += KEY_SIZE(k);
61 return ret;
62}
63
64void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
65{
66 va_list args;
67 struct bkey *k, *p = NULL;
68 struct btree_iter iter;
69 const char *err;
70
71 for_each_key(b, k, &iter) {
72 if (b->ops->is_extents) {
73 err = "Keys out of order";
74 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
75 goto bug;
76
77 if (bch_ptr_invalid(b, k))
78 continue;
79
80 err = "Overlapping keys";
81 if (p && bkey_cmp(p, &START_KEY(k)) > 0)
82 goto bug;
83 } else {
84 if (bch_ptr_bad(b, k))
85 continue;
86
87 err = "Duplicate keys";
88 if (p && !bkey_cmp(p, k))
89 goto bug;
90 }
91 p = k;
92 }
93#if 0
94 err = "Key larger than btree node key";
95 if (p && bkey_cmp(p, &b->key) > 0)
96 goto bug;
97#endif
98 return;
99bug:
100 bch_dump_bucket(b);
101
102 va_start(args, fmt);
103 vprintk(fmt, args);
104 va_end(args);
105
106 panic("bch_check_keys error: %s:\n", err);
107}
108
109static void bch_btree_iter_next_check(struct btree_iter *iter)
110{
111 struct bkey *k = iter->data->k, *next = bkey_next(k);
112
113 if (next < iter->data->end &&
114 bkey_cmp(k, iter->b->ops->is_extents ?
115 &START_KEY(next) : next) > 0) {
116 bch_dump_bucket(iter->b);
117 panic("Key skipped backwards\n");
118 }
119}
120
121#else
122
123static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
124
125#endif
126
15/* Keylists */ 127/* Keylists */
16 128
17int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) 129int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
18{ 130{
19 size_t oldsize = bch_keylist_nkeys(l); 131 size_t oldsize = bch_keylist_nkeys(l);
20 size_t newsize = oldsize + 2 + nptrs; 132 size_t newsize = oldsize + u64s;
21 uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; 133 uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
22 uint64_t *new_keys; 134 uint64_t *new_keys;
23 135
24 /* The journalling code doesn't handle the case where the keys to insert
25 * is bigger than an empty write: If we just return -ENOMEM here,
26 * bio_insert() and bio_invalidate() will insert the keys created so far
27 * and finish the rest when the keylist is empty.
28 */
29 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
30 return -ENOMEM;
31
32 newsize = roundup_pow_of_two(newsize); 136 newsize = roundup_pow_of_two(newsize);
33 137
34 if (newsize <= KEYLIST_INLINE || 138 if (newsize <= KEYLIST_INLINE ||
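__bch_keylist_realloc() now grows by a caller-supplied number of u64s and rounds the allocation up to a power of two; a toy model of just that sizing policy (the KEYLIST_INLINE value and the append sizes are assumptions for illustration, and the real function also skips the realloc when the rounded size does not change):

#include <stdio.h>
#include <stddef.h>

#define KEYLIST_INLINE 16       /* assumed size of the inline buffer, in u64s */

/* User-space stand-in for the kernel's roundup_pow_of_two(). */
static size_t roundup_pow_of_two(size_t n)
{
        size_t r = 1;

        while (r < n)
                r <<= 1;
        return r;
}

int main(void)
{
        size_t nkeys = 0, allocated = KEYLIST_INLINE;
        size_t adds[] = { 3, 5, 9, 24 };        /* u64s appended per insert, made up */
        unsigned i;

        for (i = 0; i < sizeof(adds) / sizeof(adds[0]); i++) {
                size_t newsize = roundup_pow_of_two(nkeys + adds[i]);

                /* Only grow once we no longer fit in the inline buffer. */
                if (newsize > KEYLIST_INLINE && newsize > allocated) {
                        printf("grow %zu -> %zu u64s\n", allocated, newsize);
                        allocated = newsize;
                }
                nkeys += adds[i];
        }
        return 0;
}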
@@ -71,136 +175,6 @@ void bch_keylist_pop_front(struct keylist *l)
71 bch_keylist_bytes(l)); 175 bch_keylist_bytes(l));
72} 176}
73 177
74/* Pointer validation */
75
76static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
77{
78 unsigned i;
79
80 for (i = 0; i < KEY_PTRS(k); i++)
81 if (ptr_available(c, k, i)) {
82 struct cache *ca = PTR_CACHE(c, k, i);
83 size_t bucket = PTR_BUCKET_NR(c, k, i);
84 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
85
86 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
87 bucket < ca->sb.first_bucket ||
88 bucket >= ca->sb.nbuckets)
89 return true;
90 }
91
92 return false;
93}
94
95bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
96{
97 char buf[80];
98
99 if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
100 goto bad;
101
102 if (__ptr_invalid(c, k))
103 goto bad;
104
105 return false;
106bad:
107 bch_bkey_to_text(buf, sizeof(buf), k);
108 cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
109 return true;
110}
111
112bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
113{
114 char buf[80];
115
116 if (!KEY_SIZE(k))
117 return true;
118
119 if (KEY_SIZE(k) > KEY_OFFSET(k))
120 goto bad;
121
122 if (__ptr_invalid(c, k))
123 goto bad;
124
125 return false;
126bad:
127 bch_bkey_to_text(buf, sizeof(buf), k);
128 cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
129 return true;
130}
131
132static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
133 unsigned ptr)
134{
135 struct bucket *g = PTR_BUCKET(b->c, k, ptr);
136 char buf[80];
137
138 if (mutex_trylock(&b->c->bucket_lock)) {
139 if (b->level) {
140 if (KEY_DIRTY(k) ||
141 g->prio != BTREE_PRIO ||
142 (b->c->gc_mark_valid &&
143 GC_MARK(g) != GC_MARK_METADATA))
144 goto err;
145
146 } else {
147 if (g->prio == BTREE_PRIO)
148 goto err;
149
150 if (KEY_DIRTY(k) &&
151 b->c->gc_mark_valid &&
152 GC_MARK(g) != GC_MARK_DIRTY)
153 goto err;
154 }
155 mutex_unlock(&b->c->bucket_lock);
156 }
157
158 return false;
159err:
160 mutex_unlock(&b->c->bucket_lock);
161 bch_bkey_to_text(buf, sizeof(buf), k);
162 btree_bug(b,
163"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
164 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
165 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
166 return true;
167}
168
169bool bch_ptr_bad(struct btree *b, const struct bkey *k)
170{
171 struct bucket *g;
172 unsigned i, stale;
173
174 if (!bkey_cmp(k, &ZERO_KEY) ||
175 !KEY_PTRS(k) ||
176 bch_ptr_invalid(b, k))
177 return true;
178
179 for (i = 0; i < KEY_PTRS(k); i++) {
180 if (!ptr_available(b->c, k, i))
181 return true;
182
183 g = PTR_BUCKET(b->c, k, i);
184 stale = ptr_stale(b->c, k, i);
185
186 btree_bug_on(stale > 96, b,
187 "key too stale: %i, need_gc %u",
188 stale, b->c->need_gc);
189
190 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
191 b, "stale dirty pointer");
192
193 if (stale)
194 return true;
195
196 if (expensive_debug_checks(b->c) &&
197 ptr_bad_expensive_checks(b, k, i))
198 return true;
199 }
200
201 return false;
202}
203
204/* Key/pointer manipulation */ 178/* Key/pointer manipulation */
205 179
206void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, 180void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
@@ -255,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k)
255 return true; 229 return true;
256} 230}
257 231
258static uint64_t merge_chksums(struct bkey *l, struct bkey *r) 232/* Auxiliary search trees */
233
234/* 32 bits total: */
235#define BKEY_MID_BITS 3
236#define BKEY_EXPONENT_BITS 7
237#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
238#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
239
240struct bkey_float {
241 unsigned exponent:BKEY_EXPONENT_BITS;
242 unsigned m:BKEY_MID_BITS;
243 unsigned mantissa:BKEY_MANTISSA_BITS;
244} __packed;
245
246/*
247 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
248 * it used to be 64, but I realized the lookup code would touch slightly less
249 * memory if it was 128.
250 *
 251 * It defines the number of bytes (in struct bset) per struct bkey_float in
 252 * the auxiliary search tree - when we're done searching the bset_float tree we
253 * have this many bytes left that we do a linear search over.
254 *
255 * Since (after level 5) every level of the bset_tree is on a new cacheline,
256 * we're touching one fewer cacheline in the bset tree in exchange for one more
257 * cacheline in the linear search - but the linear search might stop before it
258 * gets to the second cacheline.
259 */
260
261#define BSET_CACHELINE 128
262
263/* Space required for the btree node keys */
264static inline size_t btree_keys_bytes(struct btree_keys *b)
259{ 265{
260 return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & 266 return PAGE_SIZE << b->page_order;
261 ~((uint64_t)1 << 63);
262} 267}
263 268
264/* Tries to merge l and r: l should be lower than r 269static inline size_t btree_keys_cachelines(struct btree_keys *b)
265 * Returns true if we were able to merge. If we did merge, l will be the merged
266 * key, r will be untouched.
267 */
268bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
269{ 270{
270 unsigned i; 271 return btree_keys_bytes(b) / BSET_CACHELINE;
272}
271 273
272 if (key_merging_disabled(b->c)) 274/* Space required for the auxiliary search trees */
273 return false; 275static inline size_t bset_tree_bytes(struct btree_keys *b)
276{
277 return btree_keys_cachelines(b) * sizeof(struct bkey_float);
278}
274 279
275 if (KEY_PTRS(l) != KEY_PTRS(r) || 280/* Space required for the prev pointers */
276 KEY_DIRTY(l) != KEY_DIRTY(r) || 281static inline size_t bset_prev_bytes(struct btree_keys *b)
277 bkey_cmp(l, &START_KEY(r))) 282{
278 return false; 283 return btree_keys_cachelines(b) * sizeof(uint8_t);
284}
279 285
280 for (i = 0; i < KEY_PTRS(l); i++) 286/* Memory allocation */
281 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
282 PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
283 return false;
284 287
285 /* Keys with no pointers aren't restricted to one bucket and could 288void bch_btree_keys_free(struct btree_keys *b)
286 * overflow KEY_SIZE 289{
287 */ 290 struct bset_tree *t = b->set;
288 if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
289 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
290 SET_KEY_SIZE(l, USHRT_MAX);
291 291
292 bch_cut_front(l, r); 292 if (bset_prev_bytes(b) < PAGE_SIZE)
293 return false; 293 kfree(t->prev);
294 } 294 else
295 free_pages((unsigned long) t->prev,
296 get_order(bset_prev_bytes(b)));
295 297
296 if (KEY_CSUM(l)) { 298 if (bset_tree_bytes(b) < PAGE_SIZE)
297 if (KEY_CSUM(r)) 299 kfree(t->tree);
298 l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); 300 else
299 else 301 free_pages((unsigned long) t->tree,
300 SET_KEY_CSUM(l, 0); 302 get_order(bset_tree_bytes(b)));
301 }
302 303
303 SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); 304 free_pages((unsigned long) t->data, b->page_order);
304 SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
305 305
306 return true; 306 t->prev = NULL;
307 t->tree = NULL;
308 t->data = NULL;
309}
310EXPORT_SYMBOL(bch_btree_keys_free);
311
312int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
313{
314 struct bset_tree *t = b->set;
315
316 BUG_ON(t->data);
317
318 b->page_order = page_order;
319
320 t->data = (void *) __get_free_pages(gfp, b->page_order);
321 if (!t->data)
322 goto err;
323
324 t->tree = bset_tree_bytes(b) < PAGE_SIZE
325 ? kmalloc(bset_tree_bytes(b), gfp)
326 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
327 if (!t->tree)
328 goto err;
329
330 t->prev = bset_prev_bytes(b) < PAGE_SIZE
331 ? kmalloc(bset_prev_bytes(b), gfp)
332 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
333 if (!t->prev)
334 goto err;
335
336 return 0;
337err:
338 bch_btree_keys_free(b);
339 return -ENOMEM;
307} 340}
341EXPORT_SYMBOL(bch_btree_keys_alloc);
342
343void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
344 bool *expensive_debug_checks)
345{
346 unsigned i;
347
348 b->ops = ops;
349 b->expensive_debug_checks = expensive_debug_checks;
350 b->nsets = 0;
351 b->last_set_unwritten = 0;
352
353 /* XXX: shouldn't be needed */
354 for (i = 0; i < MAX_BSETS; i++)
355 b->set[i].size = 0;
356 /*
357 * Second loop starts at 1 because b->keys[0]->data is the memory we
358 * allocated
359 */
360 for (i = 1; i < MAX_BSETS; i++)
361 b->set[i].data = NULL;
362}
363EXPORT_SYMBOL(bch_btree_keys_init);
308 364
309/* Binary tree stuff for auxiliary search trees */ 365/* Binary tree stuff for auxiliary search trees */
310 366
@@ -455,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
455 return ((void *) k - (void *) t->data) / BSET_CACHELINE; 511 return ((void *) k - (void *) t->data) / BSET_CACHELINE;
456} 512}
457 513
458static unsigned bkey_to_cacheline_offset(struct bkey *k) 514static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
515 unsigned cacheline,
516 struct bkey *k)
459{ 517{
460 return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); 518 return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
461} 519}
462 520
463static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) 521static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
@@ -504,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
504 : tree_to_prev_bkey(t, j >> ffs(j)); 562 : tree_to_prev_bkey(t, j >> ffs(j));
505 563
506 struct bkey *r = is_power_of_2(j + 1) 564 struct bkey *r = is_power_of_2(j + 1)
507 ? node(t->data, t->data->keys - bkey_u64s(&t->end)) 565 ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
508 : tree_to_bkey(t, j >> (ffz(j) + 1)); 566 : tree_to_bkey(t, j >> (ffz(j) + 1));
509 567
510 BUG_ON(m < l || m > r); 568 BUG_ON(m < l || m > r);
@@ -528,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
528 f->exponent = 127; 586 f->exponent = 127;
529} 587}
530 588
531static void bset_alloc_tree(struct btree *b, struct bset_tree *t) 589static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
532{ 590{
533 if (t != b->sets) { 591 if (t != b->set) {
534 unsigned j = roundup(t[-1].size, 592 unsigned j = roundup(t[-1].size,
535 64 / sizeof(struct bkey_float)); 593 64 / sizeof(struct bkey_float));
536 594
@@ -538,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
538 t->prev = t[-1].prev + j; 596 t->prev = t[-1].prev + j;
539 } 597 }
540 598
541 while (t < b->sets + MAX_BSETS) 599 while (t < b->set + MAX_BSETS)
542 t++->size = 0; 600 t++->size = 0;
543} 601}
544 602
545static void bset_build_unwritten_tree(struct btree *b) 603static void bch_bset_build_unwritten_tree(struct btree_keys *b)
546{ 604{
547 struct bset_tree *t = b->sets + b->nsets; 605 struct bset_tree *t = bset_tree_last(b);
606
607 BUG_ON(b->last_set_unwritten);
608 b->last_set_unwritten = 1;
548 609
549 bset_alloc_tree(b, t); 610 bset_alloc_tree(b, t);
550 611
551 if (t->tree != b->sets->tree + bset_tree_space(b)) { 612 if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
552 t->prev[0] = bkey_to_cacheline_offset(t->data->start); 613 t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
553 t->size = 1; 614 t->size = 1;
554 } 615 }
555} 616}
556 617
557static void bset_build_written_tree(struct btree *b) 618void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
619{
620 if (i != b->set->data) {
621 b->set[++b->nsets].data = i;
622 i->seq = b->set->data->seq;
623 } else
624 get_random_bytes(&i->seq, sizeof(uint64_t));
625
626 i->magic = magic;
627 i->version = 0;
628 i->keys = 0;
629
630 bch_bset_build_unwritten_tree(b);
631}
632EXPORT_SYMBOL(bch_bset_init_next);
633
634void bch_bset_build_written_tree(struct btree_keys *b)
558{ 635{
559 struct bset_tree *t = b->sets + b->nsets; 636 struct bset_tree *t = bset_tree_last(b);
560 struct bkey *k = t->data->start; 637 struct bkey *prev = NULL, *k = t->data->start;
561 unsigned j, cacheline = 1; 638 unsigned j, cacheline = 1;
562 639
640 b->last_set_unwritten = 0;
641
563 bset_alloc_tree(b, t); 642 bset_alloc_tree(b, t);
564 643
565 t->size = min_t(unsigned, 644 t->size = min_t(unsigned,
566 bkey_to_cacheline(t, end(t->data)), 645 bkey_to_cacheline(t, bset_bkey_last(t->data)),
567 b->sets->tree + bset_tree_space(b) - t->tree); 646 b->set->tree + btree_keys_cachelines(b) - t->tree);
568 647
569 if (t->size < 2) { 648 if (t->size < 2) {
570 t->size = 0; 649 t->size = 0;
@@ -577,16 +656,14 @@ static void bset_build_written_tree(struct btree *b)
577 for (j = inorder_next(0, t->size); 656 for (j = inorder_next(0, t->size);
578 j; 657 j;
579 j = inorder_next(j, t->size)) { 658 j = inorder_next(j, t->size)) {
580 while (bkey_to_cacheline(t, k) != cacheline) 659 while (bkey_to_cacheline(t, k) < cacheline)
581 k = bkey_next(k); 660 prev = k, k = bkey_next(k);
582 661
583 t->prev[j] = bkey_u64s(k); 662 t->prev[j] = bkey_u64s(prev);
584 k = bkey_next(k); 663 t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
585 cacheline++;
586 t->tree[j].m = bkey_to_cacheline_offset(k);
587 } 664 }
588 665
589 while (bkey_next(k) != end(t->data)) 666 while (bkey_next(k) != bset_bkey_last(t->data))
590 k = bkey_next(k); 667 k = bkey_next(k);
591 668
592 t->end = *k; 669 t->end = *k;
@@ -597,14 +674,17 @@ static void bset_build_written_tree(struct btree *b)
597 j = inorder_next(j, t->size)) 674 j = inorder_next(j, t->size))
598 make_bfloat(t, j); 675 make_bfloat(t, j);
599} 676}
677EXPORT_SYMBOL(bch_bset_build_written_tree);
600 678
601void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) 679/* Insert */
680
681void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
602{ 682{
603 struct bset_tree *t; 683 struct bset_tree *t;
604 unsigned inorder, j = 1; 684 unsigned inorder, j = 1;
605 685
606 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 686 for (t = b->set; t <= bset_tree_last(b); t++)
607 if (k < end(t->data)) 687 if (k < bset_bkey_last(t->data))
608 goto found_set; 688 goto found_set;
609 689
610 BUG(); 690 BUG();
@@ -617,7 +697,7 @@ found_set:
617 if (k == t->data->start) 697 if (k == t->data->start)
618 goto fix_left; 698 goto fix_left;
619 699
620 if (bkey_next(k) == end(t->data)) { 700 if (bkey_next(k) == bset_bkey_last(t->data)) {
621 t->end = *k; 701 t->end = *k;
622 goto fix_right; 702 goto fix_right;
623 } 703 }
@@ -642,10 +722,12 @@ fix_right: do {
642 j = j * 2 + 1; 722 j = j * 2 + 1;
643 } while (j < t->size); 723 } while (j < t->size);
644} 724}
725EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
645 726
646void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) 727static void bch_bset_fix_lookup_table(struct btree_keys *b,
728 struct bset_tree *t,
729 struct bkey *k)
647{ 730{
648 struct bset_tree *t = &b->sets[b->nsets];
649 unsigned shift = bkey_u64s(k); 731 unsigned shift = bkey_u64s(k);
650 unsigned j = bkey_to_cacheline(t, k); 732 unsigned j = bkey_to_cacheline(t, k);
651 733
@@ -657,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
657 * lookup table for the first key that is strictly greater than k: 739 * lookup table for the first key that is strictly greater than k:
658 * it's either k's cacheline or the next one 740 * it's either k's cacheline or the next one
659 */ 741 */
660 if (j < t->size && 742 while (j < t->size &&
661 table_to_bkey(t, j) <= k) 743 table_to_bkey(t, j) <= k)
662 j++; 744 j++;
663 745
664 /* Adjust all the lookup table entries, and find a new key for any that 746 /* Adjust all the lookup table entries, and find a new key for any that
@@ -673,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
673 while (k < cacheline_to_bkey(t, j, 0)) 755 while (k < cacheline_to_bkey(t, j, 0))
674 k = bkey_next(k); 756 k = bkey_next(k);
675 757
676 t->prev[j] = bkey_to_cacheline_offset(k); 758 t->prev[j] = bkey_to_cacheline_offset(t, j, k);
677 } 759 }
678 } 760 }
679 761
680 if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) 762 if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
681 return; 763 return;
682 764
683 /* Possibly add a new entry to the end of the lookup table */ 765 /* Possibly add a new entry to the end of the lookup table */
684 766
685 for (k = table_to_bkey(t, t->size - 1); 767 for (k = table_to_bkey(t, t->size - 1);
686 k != end(t->data); 768 k != bset_bkey_last(t->data);
687 k = bkey_next(k)) 769 k = bkey_next(k))
688 if (t->size == bkey_to_cacheline(t, k)) { 770 if (t->size == bkey_to_cacheline(t, k)) {
689 t->prev[t->size] = bkey_to_cacheline_offset(k); 771 t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k);
690 t->size++; 772 t->size++;
691 } 773 }
692} 774}
693 775
694void bch_bset_init_next(struct btree *b) 776/*
777 * Tries to merge l and r: l should be lower than r
778 * Returns true if we were able to merge. If we did merge, l will be the merged
779 * key, r will be untouched.
780 */
781bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
695{ 782{
696 struct bset *i = write_block(b); 783 if (!b->ops->key_merge)
784 return false;
697 785
698 if (i != b->sets[0].data) { 786 /*
699 b->sets[++b->nsets].data = i; 787 * Generic header checks
700 i->seq = b->sets[0].data->seq; 788 * Assumes left and right are in order
701 } else 789 * Left and right must be exactly aligned
702 get_random_bytes(&i->seq, sizeof(uint64_t)); 790 */
791 if (!bch_bkey_equal_header(l, r) ||
792 bkey_cmp(l, &START_KEY(r)))
793 return false;
703 794
704 i->magic = bset_magic(&b->c->sb); 795 return b->ops->key_merge(b, l, r);
705 i->version = 0; 796}
706 i->keys = 0; 797EXPORT_SYMBOL(bch_bkey_try_merge);
798
799void bch_bset_insert(struct btree_keys *b, struct bkey *where,
800 struct bkey *insert)
801{
802 struct bset_tree *t = bset_tree_last(b);
803
804 BUG_ON(!b->last_set_unwritten);
805 BUG_ON(bset_byte_offset(b, t->data) +
806 __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
807 PAGE_SIZE << b->page_order);
808
809 memmove((uint64_t *) where + bkey_u64s(insert),
810 where,
811 (void *) bset_bkey_last(t->data) - (void *) where);
812
813 t->data->keys += bkey_u64s(insert);
814 bkey_copy(where, insert);
815 bch_bset_fix_lookup_table(b, t, where);
816}
817EXPORT_SYMBOL(bch_bset_insert);
818
819unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
820 struct bkey *replace_key)
821{
822 unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
823 struct bset *i = bset_tree_last(b)->data;
824 struct bkey *m, *prev = NULL;
825 struct btree_iter iter;
826
827 BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
828
829 m = bch_btree_iter_init(b, &iter, b->ops->is_extents
830 ? PRECEDING_KEY(&START_KEY(k))
831 : PRECEDING_KEY(k));
832
833 if (b->ops->insert_fixup(b, k, &iter, replace_key))
834 return status;
707 835
708 bset_build_unwritten_tree(b); 836 status = BTREE_INSERT_STATUS_INSERT;
837
838 while (m != bset_bkey_last(i) &&
839 bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
840 prev = m, m = bkey_next(m);
841
842 /* prev is in the tree, if we merge we're done */
843 status = BTREE_INSERT_STATUS_BACK_MERGE;
844 if (prev &&
845 bch_bkey_try_merge(b, prev, k))
846 goto merged;
847#if 0
848 status = BTREE_INSERT_STATUS_OVERWROTE;
849 if (m != bset_bkey_last(i) &&
850 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
851 goto copy;
852#endif
853 status = BTREE_INSERT_STATUS_FRONT_MERGE;
854 if (m != bset_bkey_last(i) &&
855 bch_bkey_try_merge(b, k, m))
856 goto copy;
857
858 bch_bset_insert(b, m, k);
859copy: bkey_copy(m, k);
860merged:
861 return status;
709} 862}
863EXPORT_SYMBOL(bch_btree_insert_key);
864
865/* Lookup */
710 866
711struct bset_search_iter { 867struct bset_search_iter {
712 struct bkey *l, *r; 868 struct bkey *l, *r;
713}; 869};
714 870
715static struct bset_search_iter bset_search_write_set(struct btree *b, 871static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
716 struct bset_tree *t,
717 const struct bkey *search) 872 const struct bkey *search)
718{ 873{
719 unsigned li = 0, ri = t->size; 874 unsigned li = 0, ri = t->size;
720 875
721 BUG_ON(!b->nsets &&
722 t->size < bkey_to_cacheline(t, end(t->data)));
723
724 while (li + 1 != ri) { 876 while (li + 1 != ri) {
725 unsigned m = (li + ri) >> 1; 877 unsigned m = (li + ri) >> 1;
726 878
@@ -732,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b,
732 884
733 return (struct bset_search_iter) { 885 return (struct bset_search_iter) {
734 table_to_bkey(t, li), 886 table_to_bkey(t, li),
735 ri < t->size ? table_to_bkey(t, ri) : end(t->data) 887 ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
736 }; 888 };
737} 889}
738 890
739static struct bset_search_iter bset_search_tree(struct btree *b, 891static struct bset_search_iter bset_search_tree(struct bset_tree *t,
740 struct bset_tree *t,
741 const struct bkey *search) 892 const struct bkey *search)
742{ 893{
743 struct bkey *l, *r; 894 struct bkey *l, *r;
@@ -784,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
784 f = &t->tree[inorder_next(j, t->size)]; 935 f = &t->tree[inorder_next(j, t->size)];
785 r = cacheline_to_bkey(t, inorder, f->m); 936 r = cacheline_to_bkey(t, inorder, f->m);
786 } else 937 } else
787 r = end(t->data); 938 r = bset_bkey_last(t->data);
788 } else { 939 } else {
789 r = cacheline_to_bkey(t, inorder, f->m); 940 r = cacheline_to_bkey(t, inorder, f->m);
790 941
@@ -798,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
798 return (struct bset_search_iter) {l, r}; 949 return (struct bset_search_iter) {l, r};
799} 950}
800 951
801struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, 952struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
802 const struct bkey *search) 953 const struct bkey *search)
803{ 954{
804 struct bset_search_iter i; 955 struct bset_search_iter i;
@@ -820,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
820 971
821 if (unlikely(!t->size)) { 972 if (unlikely(!t->size)) {
822 i.l = t->data->start; 973 i.l = t->data->start;
823 i.r = end(t->data); 974 i.r = bset_bkey_last(t->data);
824 } else if (bset_written(b, t)) { 975 } else if (bset_written(b, t)) {
825 /* 976 /*
826 * Each node in the auxiliary search tree covers a certain range 977 * Each node in the auxiliary search tree covers a certain range
@@ -830,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
830 */ 981 */
831 982
832 if (unlikely(bkey_cmp(search, &t->end) >= 0)) 983 if (unlikely(bkey_cmp(search, &t->end) >= 0))
833 return end(t->data); 984 return bset_bkey_last(t->data);
834 985
835 if (unlikely(bkey_cmp(search, t->data->start) < 0)) 986 if (unlikely(bkey_cmp(search, t->data->start) < 0))
836 return t->data->start; 987 return t->data->start;
837 988
838 i = bset_search_tree(b, t, search); 989 i = bset_search_tree(t, search);
839 } else 990 } else {
840 i = bset_search_write_set(b, t, search); 991 BUG_ON(!b->nsets &&
992 t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));
841 993
842 if (expensive_debug_checks(b->c)) { 994 i = bset_search_write_set(t, search);
995 }
996
997 if (btree_keys_expensive_checks(b)) {
843 BUG_ON(bset_written(b, t) && 998 BUG_ON(bset_written(b, t) &&
844 i.l != t->data->start && 999 i.l != t->data->start &&
845 bkey_cmp(tree_to_prev_bkey(t, 1000 bkey_cmp(tree_to_prev_bkey(t,
846 inorder_to_tree(bkey_to_cacheline(t, i.l), t)), 1001 inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
847 search) > 0); 1002 search) > 0);
848 1003
849 BUG_ON(i.r != end(t->data) && 1004 BUG_ON(i.r != bset_bkey_last(t->data) &&
850 bkey_cmp(i.r, search) <= 0); 1005 bkey_cmp(i.r, search) <= 0);
851 } 1006 }
852 1007
@@ -856,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
856 1011
857 return i.l; 1012 return i.l;
858} 1013}
1014EXPORT_SYMBOL(__bch_bset_search);
859 1015
860/* Btree iterator */ 1016/* Btree iterator */
861 1017
862/* 1018typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
863 * Returns true if l > r - unless l == r, in which case returns true if l is 1019 struct btree_iter_set);
864 * older than r. 1020
865 *
866 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
867 * equal in different sets, we have to process them newest to oldest.
868 */
869static inline bool btree_iter_cmp(struct btree_iter_set l, 1021static inline bool btree_iter_cmp(struct btree_iter_set l,
870 struct btree_iter_set r) 1022 struct btree_iter_set r)
871{ 1023{
872 int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); 1024 return bkey_cmp(l.k, r.k) > 0;
873
874 return c ? c > 0 : l.k < r.k;
875} 1025}
876 1026
877static inline bool btree_iter_end(struct btree_iter *iter) 1027static inline bool btree_iter_end(struct btree_iter *iter)
@@ -888,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
888 btree_iter_cmp)); 1038 btree_iter_cmp));
889} 1039}
890 1040
891struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, 1041static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
892 struct bkey *search, struct bset_tree *start) 1042 struct btree_iter *iter,
1043 struct bkey *search,
1044 struct bset_tree *start)
893{ 1045{
894 struct bkey *ret = NULL; 1046 struct bkey *ret = NULL;
895 iter->size = ARRAY_SIZE(iter->data); 1047 iter->size = ARRAY_SIZE(iter->data);
@@ -899,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
899 iter->b = b; 1051 iter->b = b;
900#endif 1052#endif
901 1053
902 for (; start <= &b->sets[b->nsets]; start++) { 1054 for (; start <= bset_tree_last(b); start++) {
903 ret = bch_bset_search(b, start, search); 1055 ret = bch_bset_search(b, start, search);
904 bch_btree_iter_push(iter, ret, end(start->data)); 1056 bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
905 } 1057 }
906 1058
907 return ret; 1059 return ret;
908} 1060}
909 1061
910struct bkey *bch_btree_iter_next(struct btree_iter *iter) 1062struct bkey *bch_btree_iter_init(struct btree_keys *b,
1063 struct btree_iter *iter,
1064 struct bkey *search)
1065{
1066 return __bch_btree_iter_init(b, iter, search, b->set);
1067}
1068EXPORT_SYMBOL(bch_btree_iter_init);
1069
1070static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
1071 btree_iter_cmp_fn *cmp)
911{ 1072{
912 struct btree_iter_set unused; 1073 struct btree_iter_set unused;
913 struct bkey *ret = NULL; 1074 struct bkey *ret = NULL;
@@ -924,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
924 } 1085 }
925 1086
926 if (iter->data->k == iter->data->end) 1087 if (iter->data->k == iter->data->end)
927 heap_pop(iter, unused, btree_iter_cmp); 1088 heap_pop(iter, unused, cmp);
928 else 1089 else
929 heap_sift(iter, 0, btree_iter_cmp); 1090 heap_sift(iter, 0, cmp);
930 } 1091 }
931 1092
932 return ret; 1093 return ret;
933} 1094}
934 1095
1096struct bkey *bch_btree_iter_next(struct btree_iter *iter)
1097{
1098 return __bch_btree_iter_next(iter, btree_iter_cmp);
1099
1100}
1101EXPORT_SYMBOL(bch_btree_iter_next);
1102
935struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, 1103struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
936 struct btree *b, ptr_filter_fn fn) 1104 struct btree_keys *b, ptr_filter_fn fn)
937{ 1105{
938 struct bkey *ret; 1106 struct bkey *ret;
939 1107
@@ -946,70 +1114,58 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
946 1114
947/* Mergesort */ 1115/* Mergesort */
948 1116
949static void sort_key_next(struct btree_iter *iter, 1117void bch_bset_sort_state_free(struct bset_sort_state *state)
950 struct btree_iter_set *i)
951{ 1118{
952 i->k = bkey_next(i->k); 1119 if (state->pool)
953 1120 mempool_destroy(state->pool);
954 if (i->k == i->end)
955 *i = iter->data[--iter->used];
956} 1121}
957 1122
958static void btree_sort_fixup(struct btree_iter *iter) 1123int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
959{ 1124{
960 while (iter->used > 1) { 1125 spin_lock_init(&state->time.lock);
961 struct btree_iter_set *top = iter->data, *i = top + 1;
962
963 if (iter->used > 2 &&
964 btree_iter_cmp(i[0], i[1]))
965 i++;
966
967 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
968 break;
969 1126
970 if (!KEY_SIZE(i->k)) { 1127 state->page_order = page_order;
971 sort_key_next(iter, i); 1128 state->crit_factor = int_sqrt(1 << page_order);
972 heap_sift(iter, i - top, btree_iter_cmp);
973 continue;
974 }
975 1129
976 if (top->k > i->k) { 1130 state->pool = mempool_create_page_pool(1, page_order);
977 if (bkey_cmp(top->k, i->k) >= 0) 1131 if (!state->pool)
978 sort_key_next(iter, i); 1132 return -ENOMEM;
979 else
980 bch_cut_front(top->k, i->k);
981 1133
982 heap_sift(iter, i - top, btree_iter_cmp); 1134 return 0;
983 } else {
984 /* can't happen because of comparison func */
985 BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
986 bch_cut_back(&START_KEY(i->k), top->k);
987 }
988 }
989} 1135}
1136EXPORT_SYMBOL(bch_bset_sort_state_init);
990 1137
991static void btree_mergesort(struct btree *b, struct bset *out, 1138static void btree_mergesort(struct btree_keys *b, struct bset *out,
992 struct btree_iter *iter, 1139 struct btree_iter *iter,
993 bool fixup, bool remove_stale) 1140 bool fixup, bool remove_stale)
994{ 1141{
1142 int i;
995 struct bkey *k, *last = NULL; 1143 struct bkey *k, *last = NULL;
996 bool (*bad)(struct btree *, const struct bkey *) = remove_stale 1144 BKEY_PADDED(k) tmp;
1145 bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
997 ? bch_ptr_bad 1146 ? bch_ptr_bad
998 : bch_ptr_invalid; 1147 : bch_ptr_invalid;
999 1148
1149 /* Heapify the iterator, using our comparison function */
1150 for (i = iter->used / 2 - 1; i >= 0; --i)
1151 heap_sift(iter, i, b->ops->sort_cmp);
1152
1000 while (!btree_iter_end(iter)) { 1153 while (!btree_iter_end(iter)) {
1001 if (fixup && !b->level) 1154 if (b->ops->sort_fixup && fixup)
1002 btree_sort_fixup(iter); 1155 k = b->ops->sort_fixup(iter, &tmp.k);
1156 else
1157 k = NULL;
1158
1159 if (!k)
1160 k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
1003 1161
1004 k = bch_btree_iter_next(iter);
1005 if (bad(b, k)) 1162 if (bad(b, k))
1006 continue; 1163 continue;
1007 1164
1008 if (!last) { 1165 if (!last) {
1009 last = out->start; 1166 last = out->start;
1010 bkey_copy(last, k); 1167 bkey_copy(last, k);
1011 } else if (b->level || 1168 } else if (!bch_bkey_try_merge(b, last, k)) {
1012 !bch_bkey_try_merge(b, last, k)) {
1013 last = bkey_next(last); 1169 last = bkey_next(last);
1014 bkey_copy(last, k); 1170 bkey_copy(last, k);
1015 } 1171 }
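
bch_bset_sort_state_init() above preallocates a one-element page pool that __btree_sort() (in the next hunk) falls back to when an opportunistic __get_free_pages() call fails, replacing the old cache-set-wide sort buffer and sort_lock. Illustration only: a userspace analogue of that shape, with a mutex standing in for the mempool's blocking behaviour; reserve_pool and its functions are invented for the sketch and are not the kernel mempool API.

#include <stdlib.h>
#include <pthread.h>

struct reserve_pool {
	void *reserve;			/* preallocated at init, kept for emergencies */
	size_t reserve_size;
	pthread_mutex_t lock;		/* one user of the reserve at a time */
};

static int pool_init(struct reserve_pool *p, size_t reserve_size)
{
	p->reserve = malloc(reserve_size);
	p->reserve_size = reserve_size;
	pthread_mutex_init(&p->lock, NULL);
	return p->reserve ? 0 : -1;
}

/* Try a fresh allocation first; fall back to the guaranteed reserve */
static void *pool_alloc(struct reserve_pool *p, size_t size, int *used_reserve)
{
	void *buf = size <= p->reserve_size ? malloc(size) : NULL;

	if (buf) {
		*used_reserve = 0;
		return buf;
	}

	/* May block until the previous reserve user calls pool_free() */
	pthread_mutex_lock(&p->lock);
	*used_reserve = 1;
	return p->reserve;
}

static void pool_free(struct reserve_pool *p, void *buf, int used_reserve)
{
	if (used_reserve)
		pthread_mutex_unlock(&p->lock);
	else
		free(buf);
}

int main(void)
{
	struct reserve_pool p;
	int used_reserve;
	void *buf;

	if (pool_init(&p, 1 << 16))
		return 1;

	buf = pool_alloc(&p, 1 << 16, &used_reserve);
	/* ... sort into buf ... */
	pool_free(&p, buf, used_reserve);
	return 0;
}

The kernel version guarantees forward progress the same way: the mempool always holds one buffer of the maximum sort order (note the BUG_ON(order > state->page_order)), so a sort can never fail for lack of memory.
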
@@ -1020,27 +1176,30 @@ static void btree_mergesort(struct btree *b, struct bset *out,
1020 pr_debug("sorted %i keys", out->keys); 1176 pr_debug("sorted %i keys", out->keys);
1021} 1177}
1022 1178
1023static void __btree_sort(struct btree *b, struct btree_iter *iter, 1179static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
1024 unsigned start, unsigned order, bool fixup) 1180 unsigned start, unsigned order, bool fixup,
1181 struct bset_sort_state *state)
1025{ 1182{
1026 uint64_t start_time; 1183 uint64_t start_time;
1027 bool remove_stale = !b->written; 1184 bool used_mempool = false;
1028 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, 1185 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
1029 order); 1186 order);
1030 if (!out) { 1187 if (!out) {
1031 mutex_lock(&b->c->sort_lock); 1188 struct page *outp;
1032 out = b->c->sort; 1189
1033 order = ilog2(bucket_pages(b->c)); 1190 BUG_ON(order > state->page_order);
1191
1192 outp = mempool_alloc(state->pool, GFP_NOIO);
1193 out = page_address(outp);
1194 used_mempool = true;
1195 order = state->page_order;
1034 } 1196 }
1035 1197
1036 start_time = local_clock(); 1198 start_time = local_clock();
1037 1199
1038 btree_mergesort(b, out, iter, fixup, remove_stale); 1200 btree_mergesort(b, out, iter, fixup, false);
1039 b->nsets = start; 1201 b->nsets = start;
1040 1202
1041 if (!fixup && !start && b->written)
1042 bch_btree_verify(b, out);
1043
1044 if (!start && order == b->page_order) { 1203 if (!start && order == b->page_order) {
1045 /* 1204 /*
1046 * Our temporary buffer is the same size as the btree node's 1205 * Our temporary buffer is the same size as the btree node's
@@ -1048,84 +1207,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
1048 * memcpy() 1207 * memcpy()
1049 */ 1208 */
1050 1209
1051 out->magic = bset_magic(&b->c->sb); 1210 out->magic = b->set->data->magic;
1052 out->seq = b->sets[0].data->seq; 1211 out->seq = b->set->data->seq;
1053 out->version = b->sets[0].data->version; 1212 out->version = b->set->data->version;
1054 swap(out, b->sets[0].data); 1213 swap(out, b->set->data);
1055
1056 if (b->c->sort == b->sets[0].data)
1057 b->c->sort = out;
1058 } else { 1214 } else {
1059 b->sets[start].data->keys = out->keys; 1215 b->set[start].data->keys = out->keys;
1060 memcpy(b->sets[start].data->start, out->start, 1216 memcpy(b->set[start].data->start, out->start,
1061 (void *) end(out) - (void *) out->start); 1217 (void *) bset_bkey_last(out) - (void *) out->start);
1062 } 1218 }
1063 1219
1064 if (out == b->c->sort) 1220 if (used_mempool)
1065 mutex_unlock(&b->c->sort_lock); 1221 mempool_free(virt_to_page(out), state->pool);
1066 else 1222 else
1067 free_pages((unsigned long) out, order); 1223 free_pages((unsigned long) out, order);
1068 1224
1069 if (b->written) 1225 bch_bset_build_written_tree(b);
1070 bset_build_written_tree(b);
1071 1226
1072 if (!start) 1227 if (!start)
1073 bch_time_stats_update(&b->c->sort_time, start_time); 1228 bch_time_stats_update(&state->time, start_time);
1074} 1229}
1075 1230
1076void bch_btree_sort_partial(struct btree *b, unsigned start) 1231void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
1232 struct bset_sort_state *state)
1077{ 1233{
1078 size_t order = b->page_order, keys = 0; 1234 size_t order = b->page_order, keys = 0;
1079 struct btree_iter iter; 1235 struct btree_iter iter;
1080 int oldsize = bch_count_data(b); 1236 int oldsize = bch_count_data(b);
1081 1237
1082 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); 1238 __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
1083
1084 BUG_ON(b->sets[b->nsets].data == write_block(b) &&
1085 (b->sets[b->nsets].size || b->nsets));
1086
1087 1239
1088 if (start) { 1240 if (start) {
1089 unsigned i; 1241 unsigned i;
1090 1242
1091 for (i = start; i <= b->nsets; i++) 1243 for (i = start; i <= b->nsets; i++)
1092 keys += b->sets[i].data->keys; 1244 keys += b->set[i].data->keys;
1093 1245
1094 order = roundup_pow_of_two(__set_bytes(b->sets->data, 1246 order = get_order(__set_bytes(b->set->data, keys));
1095 keys)) / PAGE_SIZE;
1096 if (order)
1097 order = ilog2(order);
1098 } 1247 }
1099 1248
1100 __btree_sort(b, &iter, start, order, false); 1249 __btree_sort(b, &iter, start, order, false, state);
1101 1250
1102 EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); 1251 EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
1103} 1252}
1253EXPORT_SYMBOL(bch_btree_sort_partial);
1104 1254
1105void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) 1255void bch_btree_sort_and_fix_extents(struct btree_keys *b,
1256 struct btree_iter *iter,
1257 struct bset_sort_state *state)
1106{ 1258{
1107 BUG_ON(!b->written); 1259 __btree_sort(b, iter, 0, b->page_order, true, state);
1108 __btree_sort(b, iter, 0, b->page_order, true);
1109} 1260}
1110 1261
1111void bch_btree_sort_into(struct btree *b, struct btree *new) 1262void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
1263 struct bset_sort_state *state)
1112{ 1264{
1113 uint64_t start_time = local_clock(); 1265 uint64_t start_time = local_clock();
1114 1266
1115 struct btree_iter iter; 1267 struct btree_iter iter;
1116 bch_btree_iter_init(b, &iter, NULL); 1268 bch_btree_iter_init(b, &iter, NULL);
1117 1269
1118 btree_mergesort(b, new->sets->data, &iter, false, true); 1270 btree_mergesort(b, new->set->data, &iter, false, true);
1119 1271
1120 bch_time_stats_update(&b->c->sort_time, start_time); 1272 bch_time_stats_update(&state->time, start_time);
1121 1273
1122 bkey_copy_key(&new->key, &b->key); 1274 new->set->size = 0; // XXX: why?
1123 new->sets->size = 0;
1124} 1275}
1125 1276
1126#define SORT_CRIT (4096 / sizeof(uint64_t)) 1277#define SORT_CRIT (4096 / sizeof(uint64_t))
1127 1278
1128void bch_btree_sort_lazy(struct btree *b) 1279void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
1129{ 1280{
1130 unsigned crit = SORT_CRIT; 1281 unsigned crit = SORT_CRIT;
1131 int i; 1282 int i;
@@ -1134,50 +1285,32 @@ void bch_btree_sort_lazy(struct btree *b)
1134 if (!b->nsets) 1285 if (!b->nsets)
1135 goto out; 1286 goto out;
1136 1287
1137 /* If not a leaf node, always sort */
1138 if (b->level) {
1139 bch_btree_sort(b);
1140 return;
1141 }
1142
1143 for (i = b->nsets - 1; i >= 0; --i) { 1288 for (i = b->nsets - 1; i >= 0; --i) {
1144 crit *= b->c->sort_crit_factor; 1289 crit *= state->crit_factor;
1145 1290
1146 if (b->sets[i].data->keys < crit) { 1291 if (b->set[i].data->keys < crit) {
1147 bch_btree_sort_partial(b, i); 1292 bch_btree_sort_partial(b, i, state);
1148 return; 1293 return;
1149 } 1294 }
1150 } 1295 }
1151 1296
1152 /* Sort if we'd overflow */ 1297 /* Sort if we'd overflow */
1153 if (b->nsets + 1 == MAX_BSETS) { 1298 if (b->nsets + 1 == MAX_BSETS) {
1154 bch_btree_sort(b); 1299 bch_btree_sort(b, state);
1155 return; 1300 return;
1156 } 1301 }
1157 1302
1158out: 1303out:
1159 bset_build_written_tree(b); 1304 bch_bset_build_written_tree(b);
1160} 1305}
1306EXPORT_SYMBOL(bch_btree_sort_lazy);
1161 1307
1162/* Sysfs stuff */ 1308void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
1163
1164struct bset_stats {
1165 struct btree_op op;
1166 size_t nodes;
1167 size_t sets_written, sets_unwritten;
1168 size_t bytes_written, bytes_unwritten;
1169 size_t floats, failed;
1170};
1171
1172static int btree_bset_stats(struct btree_op *op, struct btree *b)
1173{ 1309{
1174 struct bset_stats *stats = container_of(op, struct bset_stats, op);
1175 unsigned i; 1310 unsigned i;
1176 1311
1177 stats->nodes++;
1178
1179 for (i = 0; i <= b->nsets; i++) { 1312 for (i = 0; i <= b->nsets; i++) {
1180 struct bset_tree *t = &b->sets[i]; 1313 struct bset_tree *t = &b->set[i];
1181 size_t bytes = t->data->keys * sizeof(uint64_t); 1314 size_t bytes = t->data->keys * sizeof(uint64_t);
1182 size_t j; 1315 size_t j;
1183 1316
@@ -1195,32 +1328,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b)
1195 stats->bytes_unwritten += bytes; 1328 stats->bytes_unwritten += bytes;
1196 } 1329 }
1197 } 1330 }
1198
1199 return MAP_CONTINUE;
1200}
1201
1202int bch_bset_print_stats(struct cache_set *c, char *buf)
1203{
1204 struct bset_stats t;
1205 int ret;
1206
1207 memset(&t, 0, sizeof(struct bset_stats));
1208 bch_btree_op_init(&t.op, -1);
1209
1210 ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
1211 if (ret < 0)
1212 return ret;
1213
1214 return snprintf(buf, PAGE_SIZE,
1215 "btree nodes: %zu\n"
1216 "written sets: %zu\n"
1217 "unwritten sets: %zu\n"
1218 "written key bytes: %zu\n"
1219 "unwritten key bytes: %zu\n"
1220 "floats: %zu\n"
1221 "failed: %zu\n",
1222 t.nodes,
1223 t.sets_written, t.sets_unwritten,
1224 t.bytes_written, t.bytes_unwritten,
1225 t.floats, t.failed);
1226} 1331}
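
The bch_btree_sort_lazy() hunk above keeps up to MAX_BSETS sorted sets per node and only merges from the point where a newer set has fallen behind a geometrically growing threshold (multiplied by crit_factor per level), doing a full sort only when the node is about to run out of set slots. A minimal sketch of that decision, illustration only, with invented names and figures:

#include <stdint.h>
#include <stdio.h>

#define MAX_BSETS	4
#define SORT_CRIT	(4096 / sizeof(uint64_t))

/*
 * set_keys[i] is the number of keys in sorted set i; set 0 is the big,
 * already-written set.  Returns the set index to merge from, or -1 to
 * leave the node alone.
 */
static int lazy_sort_from(const unsigned *set_keys, int nsets, unsigned crit_factor)
{
	unsigned long crit = SORT_CRIT;
	int i;

	if (!nsets)
		return -1;

	for (i = nsets - 1; i >= 0; --i) {
		crit *= crit_factor;
		if (set_keys[i] < crit)
			return i;		/* partial sort: merge sets i..nsets */
	}

	if (nsets + 1 == MAX_BSETS)
		return 0;			/* full sort before the slots overflow */

	return -1;
}

int main(void)
{
	unsigned sets[MAX_BSETS] = { 5000, 700, 90 };

	/* nsets counts the sets beyond set 0, as in struct btree_keys */
	printf("merge from set %d\n", lazy_sort_from(sets, 2, 3));
	return 0;
}

bch_btree_node_write() (later in this diff) bypasses the heuristic for internal nodes and always sorts them fully once they have more than one set.
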
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 1d3c24f9fa0e..003260f4ddf6 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,7 +1,11 @@
1#ifndef _BCACHE_BSET_H 1#ifndef _BCACHE_BSET_H
2#define _BCACHE_BSET_H 2#define _BCACHE_BSET_H
3 3
4#include <linux/slab.h> 4#include <linux/bcache.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7
8#include "util.h" /* for time_stats */
5 9
6/* 10/*
7 * BKEYS: 11 * BKEYS:
@@ -142,20 +146,13 @@
142 * first key in that range of bytes again. 146 * first key in that range of bytes again.
143 */ 147 */
144 148
145/* Btree key comparison/iteration */ 149struct btree_keys;
150struct btree_iter;
151struct btree_iter_set;
152struct bkey_float;
146 153
147#define MAX_BSETS 4U 154#define MAX_BSETS 4U
148 155
149struct btree_iter {
150 size_t size, used;
151#ifdef CONFIG_BCACHE_DEBUG
152 struct btree *b;
153#endif
154 struct btree_iter_set {
155 struct bkey *k, *end;
156 } data[MAX_BSETS];
157};
158
159struct bset_tree { 156struct bset_tree {
160 /* 157 /*
161 * We construct a binary tree in an array as if the array 158 * We construct a binary tree in an array as if the array
@@ -165,14 +162,14 @@ struct bset_tree {
165 */ 162 */
166 163
167 /* size of the binary tree and prev array */ 164 /* size of the binary tree and prev array */
168 unsigned size; 165 unsigned size;
169 166
170 /* function of size - precalculated for to_inorder() */ 167 /* function of size - precalculated for to_inorder() */
171 unsigned extra; 168 unsigned extra;
172 169
173 /* copy of the last key in the set */ 170 /* copy of the last key in the set */
174 struct bkey end; 171 struct bkey end;
175 struct bkey_float *tree; 172 struct bkey_float *tree;
176 173
177 /* 174 /*
178 * The nodes in the bset tree point to specific keys - this 175 * The nodes in the bset tree point to specific keys - this
@@ -182,12 +179,219 @@ struct bset_tree {
182 * to keep bkey_float to 4 bytes and prev isn't used in the fast 179 * to keep bkey_float to 4 bytes and prev isn't used in the fast
183 * path. 180 * path.
184 */ 181 */
185 uint8_t *prev; 182 uint8_t *prev;
186 183
187 /* The actual btree node, with pointers to each sorted set */ 184 /* The actual btree node, with pointers to each sorted set */
188 struct bset *data; 185 struct bset *data;
186};
187
188struct btree_keys_ops {
189 bool (*sort_cmp)(struct btree_iter_set,
190 struct btree_iter_set);
191 struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *);
192 bool (*insert_fixup)(struct btree_keys *, struct bkey *,
193 struct btree_iter *, struct bkey *);
194 bool (*key_invalid)(struct btree_keys *,
195 const struct bkey *);
196 bool (*key_bad)(struct btree_keys *, const struct bkey *);
197 bool (*key_merge)(struct btree_keys *,
198 struct bkey *, struct bkey *);
199 void (*key_to_text)(char *, size_t, const struct bkey *);
200 void (*key_dump)(struct btree_keys *, const struct bkey *);
201
202 /*
203 * Only used for deciding whether to use START_KEY(k) or just the key
204 * itself in a couple places
205 */
206 bool is_extents;
207};
208
209struct btree_keys {
210 const struct btree_keys_ops *ops;
211 uint8_t page_order;
212 uint8_t nsets;
213 unsigned last_set_unwritten:1;
214 bool *expensive_debug_checks;
215
216 /*
217 * Sets of sorted keys - the real btree node - plus a binary search tree
218 *
219 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
220 * to the memory we have allocated for this btree node. Additionally,
221 * set[0]->data points to the entire btree node as it exists on disk.
222 */
223 struct bset_tree set[MAX_BSETS];
224};
225
226static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
227{
228 return b->set + b->nsets;
229}
230
231static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
232{
233 return t <= b->set + b->nsets - b->last_set_unwritten;
234}
235
236static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
237{
238 return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
239}
240
241static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
242{
243 return ((size_t) i) - ((size_t) b->set->data);
244}
245
246static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
247{
248 return bset_byte_offset(b, i) >> 9;
249}
250
251#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
252#define set_bytes(i) __set_bytes(i, i->keys)
253
254#define __set_blocks(i, k, block_bytes) \
255 DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
256#define set_blocks(i, block_bytes) \
257 __set_blocks(i, (i)->keys, block_bytes)
258
259static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
260{
261 struct bset_tree *t = bset_tree_last(b);
262
263 BUG_ON((PAGE_SIZE << b->page_order) <
264 (bset_byte_offset(b, t->data) + set_bytes(t->data)));
265
266 if (!b->last_set_unwritten)
267 return 0;
268
269 return ((PAGE_SIZE << b->page_order) -
270 (bset_byte_offset(b, t->data) + set_bytes(t->data))) /
271 sizeof(u64);
272}
273
274static inline struct bset *bset_next_set(struct btree_keys *b,
275 unsigned block_bytes)
276{
277 struct bset *i = bset_tree_last(b)->data;
278
279 return ((void *) i) + roundup(set_bytes(i), block_bytes);
280}
281
282void bch_btree_keys_free(struct btree_keys *);
283int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
284void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
285 bool *);
286
287void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
288void bch_bset_build_written_tree(struct btree_keys *);
289void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
290bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
291void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
292unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
293 struct bkey *);
294
295enum {
296 BTREE_INSERT_STATUS_NO_INSERT = 0,
297 BTREE_INSERT_STATUS_INSERT,
298 BTREE_INSERT_STATUS_BACK_MERGE,
299 BTREE_INSERT_STATUS_OVERWROTE,
300 BTREE_INSERT_STATUS_FRONT_MERGE,
189}; 301};
190 302
303/* Btree key iteration */
304
305struct btree_iter {
306 size_t size, used;
307#ifdef CONFIG_BCACHE_DEBUG
308 struct btree_keys *b;
309#endif
310 struct btree_iter_set {
311 struct bkey *k, *end;
312 } data[MAX_BSETS];
313};
314
315typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);
316
317struct bkey *bch_btree_iter_next(struct btree_iter *);
318struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
319 struct btree_keys *, ptr_filter_fn);
320
321void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
322struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
323 struct bkey *);
324
325struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
326 const struct bkey *);
327
328/*
329 * Returns the first key that is strictly greater than search
330 */
331static inline struct bkey *bch_bset_search(struct btree_keys *b,
332 struct bset_tree *t,
333 const struct bkey *search)
334{
335 return search ? __bch_bset_search(b, t, search) : t->data->start;
336}
337
338#define for_each_key_filter(b, k, iter, filter) \
339 for (bch_btree_iter_init((b), (iter), NULL); \
340 ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
341
342#define for_each_key(b, k, iter) \
343 for (bch_btree_iter_init((b), (iter), NULL); \
344 ((k) = bch_btree_iter_next(iter));)
345
346/* Sorting */
347
348struct bset_sort_state {
349 mempool_t *pool;
350
351 unsigned page_order;
352 unsigned crit_factor;
353
354 struct time_stats time;
355};
356
357void bch_bset_sort_state_free(struct bset_sort_state *);
358int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
359void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
360void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
361 struct bset_sort_state *);
362void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
363 struct bset_sort_state *);
364void bch_btree_sort_partial(struct btree_keys *, unsigned,
365 struct bset_sort_state *);
366
367static inline void bch_btree_sort(struct btree_keys *b,
368 struct bset_sort_state *state)
369{
370 bch_btree_sort_partial(b, 0, state);
371}
372
373struct bset_stats {
374 size_t sets_written, sets_unwritten;
375 size_t bytes_written, bytes_unwritten;
376 size_t floats, failed;
377};
378
379void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
380
381/* Bkey utility code */
382
383#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
384
385static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
386{
387 return bkey_idx(i->start, idx);
388}
389
390static inline void bkey_init(struct bkey *k)
391{
392 *k = ZERO_KEY;
393}
394
191static __always_inline int64_t bkey_cmp(const struct bkey *l, 395static __always_inline int64_t bkey_cmp(const struct bkey *l,
192 const struct bkey *r) 396 const struct bkey *r)
193{ 397{
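
The struct btree_keys_ops added above turns the key-handling differences between leaf (extent) nodes and internal nodes into a small vtable, so bset.c can stay format-agnostic and dispatch through thin wrappers like bch_ptr_invalid(). Illustration only: the same pattern with made-up key fields and checks; the demo_* names are invented and are not the bcache types.

#include <stdbool.h>
#include <stdio.h>

struct demo_key { unsigned size, ptrs; };

struct demo_keys;

struct demo_keys_ops {
	bool (*key_invalid)(struct demo_keys *, const struct demo_key *);
	bool is_extents;
};

struct demo_keys {
	const struct demo_keys_ops *ops;
};

static bool extent_key_invalid(struct demo_keys *b, const struct demo_key *k)
{
	return !k->size || !k->ptrs;	/* extents must have a size and a pointer */
}

static bool internal_key_invalid(struct demo_keys *b, const struct demo_key *k)
{
	return !k->ptrs;		/* internal node keys carry no size */
}

static const struct demo_keys_ops demo_extent_ops = {
	.key_invalid	= extent_key_invalid,
	.is_extents	= true,
};

static const struct demo_keys_ops demo_internal_ops = {
	.key_invalid	= internal_key_invalid,
	.is_extents	= false,
};

/* Mirrors the bch_ptr_invalid() wrapper: always dispatch through ops */
static bool demo_key_invalid(struct demo_keys *b, const struct demo_key *k)
{
	return b->ops->key_invalid(b, k);
}

int main(void)
{
	struct demo_keys leaf = { .ops = &demo_extent_ops };
	struct demo_keys node = { .ops = &demo_internal_ops };
	struct demo_key k = { .size = 0, .ptrs = 1 };

	printf("leaf invalid: %d, internal invalid: %d\n",
	       demo_key_invalid(&leaf, &k), demo_key_invalid(&node, &k));
	return 0;
}

The kernel version carries more hooks (sort_cmp, sort_fixup, insert_fixup, key_merge, key_to_text), but the dispatch shape is the same, with the ops pointer chosen when the node is initialised.
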
@@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
196 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); 400 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
197} 401}
198 402
403void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
404 unsigned);
405bool __bch_cut_front(const struct bkey *, struct bkey *);
406bool __bch_cut_back(const struct bkey *, struct bkey *);
407
408static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
409{
410 BUG_ON(bkey_cmp(where, k) > 0);
411 return __bch_cut_front(where, k);
412}
413
414static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
415{
416 BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
417 return __bch_cut_back(where, k);
418}
419
420#define PRECEDING_KEY(_k) \
421({ \
422 struct bkey *_ret = NULL; \
423 \
424 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
425 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
426 \
427 if (!_ret->low) \
428 _ret->high--; \
429 _ret->low--; \
430 } \
431 \
432 _ret; \
433})
434
435static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
436{
437 return b->ops->key_invalid(b, k);
438}
439
440static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
441{
442 return b->ops->key_bad(b, k);
443}
444
445static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
446 size_t size, const struct bkey *k)
447{
448 return b->ops->key_to_text(buf, size, k);
449}
450
451static inline bool bch_bkey_equal_header(const struct bkey *l,
452 const struct bkey *r)
453{
454 return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
455 KEY_PTRS(l) == KEY_PTRS(r) &&
456 KEY_CSUM(l) == KEY_CSUM(r));
457}
458
199/* Keylists */ 459/* Keylists */
200 460
201struct keylist { 461struct keylist {
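
PRECEDING_KEY() above builds the position immediately before a key by decrementing the packed 128-bit (high, low) value, borrowing from the high word when the low word (the offset) is zero. A sketch of the same arithmetic on plain (inode, offset) integers, illustration only; struct pos and preceding_pos are invented names.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct pos { uint64_t inode, offset; };

/* Returns false for (0, 0), which has no predecessor */
static bool preceding_pos(struct pos k, struct pos *ret)
{
	if (!k.inode && !k.offset)
		return false;

	if (!k.offset)
		k.inode--;	/* borrow from the inode */
	k.offset--;		/* wraps to UINT64_MAX when borrowing */

	*ret = k;
	return true;
}

int main(void)
{
	struct pos p;

	if (preceding_pos((struct pos) { 2, 0 }, &p))
		printf("preceding (2, 0) = (%llu, %llu)\n",
		       (unsigned long long) p.inode,
		       (unsigned long long) p.offset);
	return 0;
}
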
@@ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l)
257 517
258struct bkey *bch_keylist_pop(struct keylist *); 518struct bkey *bch_keylist_pop(struct keylist *);
259void bch_keylist_pop_front(struct keylist *); 519void bch_keylist_pop_front(struct keylist *);
260int bch_keylist_realloc(struct keylist *, int, struct cache_set *); 520int __bch_keylist_realloc(struct keylist *, unsigned);
261
262void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
263 unsigned);
264bool __bch_cut_front(const struct bkey *, struct bkey *);
265bool __bch_cut_back(const struct bkey *, struct bkey *);
266 521
267static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) 522/* Debug stuff */
268{
269 BUG_ON(bkey_cmp(where, k) > 0);
270 return __bch_cut_front(where, k);
271}
272 523
273static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) 524#ifdef CONFIG_BCACHE_DEBUG
274{
275 BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
276 return __bch_cut_back(where, k);
277}
278
279const char *bch_ptr_status(struct cache_set *, const struct bkey *);
280bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
281bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
282
283bool bch_ptr_bad(struct btree *, const struct bkey *);
284
285static inline uint8_t gen_after(uint8_t a, uint8_t b)
286{
287 uint8_t r = a - b;
288 return r > 128U ? 0 : r;
289}
290
291static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
292 unsigned i)
293{
294 return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
295}
296
297static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
298 unsigned i)
299{
300 return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
301}
302
303
304typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
305
306struct bkey *bch_btree_iter_next(struct btree_iter *);
307struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
308 struct btree *, ptr_filter_fn);
309
310void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
311struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
312 struct bkey *, struct bset_tree *);
313
314/* 32 bits total: */
315#define BKEY_MID_BITS 3
316#define BKEY_EXPONENT_BITS 7
317#define BKEY_MANTISSA_BITS 22
318#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
319
320struct bkey_float {
321 unsigned exponent:BKEY_EXPONENT_BITS;
322 unsigned m:BKEY_MID_BITS;
323 unsigned mantissa:BKEY_MANTISSA_BITS;
324} __packed;
325
326/*
327 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
328 * it used to be 64, but I realized the lookup code would touch slightly less
329 * memory if it was 128.
330 *
331 * It defines the number of bytes (in struct bset) per struct bkey_float in
332 * the auxiliary search tree - when we're done searching the bkey_float tree we
333 * have this many bytes left that we do a linear search over.
334 *
335 * Since (after level 5) every level of the bset_tree is on a new cacheline,
336 * we're touching one fewer cacheline in the bset tree in exchange for one more
337 * cacheline in the linear search - but the linear search might stop before it
338 * gets to the second cacheline.
339 */
340
341#define BSET_CACHELINE 128
342#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE)
343 525
344#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) 526int __bch_count_data(struct btree_keys *);
345#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) 527void __bch_check_keys(struct btree_keys *, const char *, ...);
528void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
529void bch_dump_bucket(struct btree_keys *);
346 530
347void bch_bset_init_next(struct btree *); 531#else
348 532
349void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); 533static inline int __bch_count_data(struct btree_keys *b) { return -1; }
350void bch_bset_fix_lookup_table(struct btree *, struct bkey *); 534static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
535static inline void bch_dump_bucket(struct btree_keys *b) {}
536void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
351 537
352struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, 538#endif
353 const struct bkey *);
354 539
355/* 540static inline bool btree_keys_expensive_checks(struct btree_keys *b)
356 * Returns the first key that is strictly greater than search
357 */
358static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
359 const struct bkey *search)
360{ 541{
361 return search ? __bch_bset_search(b, t, search) : t->data->start; 542#ifdef CONFIG_BCACHE_DEBUG
543 return *b->expensive_debug_checks;
544#else
545 return false;
546#endif
362} 547}
363 548
364#define PRECEDING_KEY(_k) \ 549static inline int bch_count_data(struct btree_keys *b)
365({ \
366 struct bkey *_ret = NULL; \
367 \
368 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
369 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
370 \
371 if (!_ret->low) \
372 _ret->high--; \
373 _ret->low--; \
374 } \
375 \
376 _ret; \
377})
378
379bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
380void bch_btree_sort_lazy(struct btree *);
381void bch_btree_sort_into(struct btree *, struct btree *);
382void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
383void bch_btree_sort_partial(struct btree *, unsigned);
384
385static inline void bch_btree_sort(struct btree *b)
386{ 550{
387 bch_btree_sort_partial(b, 0); 551 return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
388} 552}
389 553
390int bch_bset_print_stats(struct cache_set *, char *); 554#define bch_check_keys(b, ...) \
555do { \
556 if (btree_keys_expensive_checks(b)) \
557 __bch_check_keys(b, __VA_ARGS__); \
558} while (0)
391 559
392#endif 560#endif
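
The tail of bset.h above gates the expensive consistency checks twice: they are compiled out entirely without CONFIG_BCACHE_DEBUG, and even when built in they only run if the per-cache-set expensive_debug_checks flag is set. The same two-level shape in a standalone sketch, illustration only; DEMO_DEBUG and the demo_* names are invented.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_DEBUG 1

struct demo_keys {
	bool *expensive_debug_checks;	/* points at a runtime toggle */
};

#if DEMO_DEBUG
static int __demo_count_data(struct demo_keys *b)
{
	/* pretend this walks every key; expensive */
	return 42;
}

static bool demo_expensive_checks(struct demo_keys *b)
{
	return *b->expensive_debug_checks;
}
#else
static int __demo_count_data(struct demo_keys *b) { return -1; }
static bool demo_expensive_checks(struct demo_keys *b) { return false; }
#endif

static int demo_count_data(struct demo_keys *b)
{
	/* -1 means "not checked", mirroring bch_count_data() */
	return demo_expensive_checks(b) ? __demo_count_data(b) : -1;
}

int main(void)
{
	bool enabled = true;
	struct demo_keys b = { .expensive_debug_checks = &enabled };

	printf("count: %d\n", demo_count_data(&b));
	enabled = false;
	printf("count: %d\n", demo_count_data(&b));
	return 0;
}
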
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 31bb53fcc67a..5f9c2a665ca5 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -23,7 +23,7 @@
23#include "bcache.h" 23#include "bcache.h"
24#include "btree.h" 24#include "btree.h"
25#include "debug.h" 25#include "debug.h"
26#include "writeback.h" 26#include "extents.h"
27 27
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/bitops.h> 29#include <linux/bitops.h>
@@ -89,13 +89,6 @@
89 * Test module load/unload 89 * Test module load/unload
90 */ 90 */
91 91
92enum {
93 BTREE_INSERT_STATUS_INSERT,
94 BTREE_INSERT_STATUS_BACK_MERGE,
95 BTREE_INSERT_STATUS_OVERWROTE,
96 BTREE_INSERT_STATUS_FRONT_MERGE,
97};
98
99#define MAX_NEED_GC 64 92#define MAX_NEED_GC 64
100#define MAX_SAVE_PRIO 72 93#define MAX_SAVE_PRIO 72
101 94
@@ -106,14 +99,6 @@ enum {
106 99
107static struct workqueue_struct *btree_io_wq; 100static struct workqueue_struct *btree_io_wq;
108 101
109static inline bool should_split(struct btree *b)
110{
111 struct bset *i = write_block(b);
112 return b->written >= btree_blocks(b) ||
113 (b->written + __set_blocks(i, i->keys + 15, b->c)
114 > btree_blocks(b));
115}
116
117#define insert_lock(s, b) ((b)->level <= (s)->lock) 102#define insert_lock(s, b) ((b)->level <= (s)->lock)
118 103
119/* 104/*
@@ -167,6 +152,8 @@ static inline bool should_split(struct btree *b)
167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 152 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
168 } \ 153 } \
169 rw_unlock(_w, _b); \ 154 rw_unlock(_w, _b); \
155 if (_r == -EINTR) \
156 schedule(); \
170 bch_cannibalize_unlock(c); \ 157 bch_cannibalize_unlock(c); \
171 if (_r == -ENOSPC) { \ 158 if (_r == -ENOSPC) { \
172 wait_event((c)->try_wait, \ 159 wait_event((c)->try_wait, \
@@ -175,9 +162,15 @@ static inline bool should_split(struct btree *b)
175 } \ 162 } \
176 } while (_r == -EINTR); \ 163 } while (_r == -EINTR); \
177 \ 164 \
165 finish_wait(&(c)->bucket_wait, &(op)->wait); \
178 _r; \ 166 _r; \
179}) 167})
180 168
169static inline struct bset *write_block(struct btree *b)
170{
171 return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
172}
173
181/* Btree key manipulation */ 174/* Btree key manipulation */
182 175
183void bkey_put(struct cache_set *c, struct bkey *k) 176void bkey_put(struct cache_set *c, struct bkey *k)
@@ -194,16 +187,16 @@ void bkey_put(struct cache_set *c, struct bkey *k)
194static uint64_t btree_csum_set(struct btree *b, struct bset *i) 187static uint64_t btree_csum_set(struct btree *b, struct bset *i)
195{ 188{
196 uint64_t crc = b->key.ptr[0]; 189 uint64_t crc = b->key.ptr[0];
197 void *data = (void *) i + 8, *end = end(i); 190 void *data = (void *) i + 8, *end = bset_bkey_last(i);
198 191
199 crc = bch_crc64_update(crc, data, end - data); 192 crc = bch_crc64_update(crc, data, end - data);
200 return crc ^ 0xffffffffffffffffULL; 193 return crc ^ 0xffffffffffffffffULL;
201} 194}
202 195
203static void bch_btree_node_read_done(struct btree *b) 196void bch_btree_node_read_done(struct btree *b)
204{ 197{
205 const char *err = "bad btree header"; 198 const char *err = "bad btree header";
206 struct bset *i = b->sets[0].data; 199 struct bset *i = btree_bset_first(b);
207 struct btree_iter *iter; 200 struct btree_iter *iter;
208 201
209 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); 202 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
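
btree_csum_set() above checksums a bset starting 8 bytes in, so the stored csum field does not cover itself, seeds the crc with the node's first pointer, and inverts the result at the end. A toy standalone version of the same shape; the additive toy_csum stands in for bcache's crc64, and demo_set and its fields are invented for the sketch.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

struct demo_set {
	uint64_t csum;		/* first 8 bytes: not covered by the checksum */
	uint64_t seq;
	uint32_t keys;
	uint64_t d[8];
};

static uint64_t toy_csum(uint64_t seed, const void *data, size_t len)
{
	const unsigned char *p = data;
	uint64_t crc = seed;
	size_t i;

	for (i = 0; i < len; i++)
		crc = crc * 31 + p[i];

	return crc;
}

static uint64_t demo_csum_set(uint64_t seed, struct demo_set *i, size_t bytes)
{
	void *data = (char *) i + 8;	/* skip the csum field itself */

	return toy_csum(seed, data, bytes - 8) ^ 0xffffffffffffffffULL;
}

int main(void)
{
	struct demo_set s;

	memset(&s, 0, sizeof(s));
	s.seq = 1234;
	s.keys = 3;
	s.csum = demo_csum_set(42, &s, sizeof(s));
	printf("csum %llx\n", (unsigned long long) s.csum);
	return 0;
}
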
@@ -211,21 +204,22 @@ static void bch_btree_node_read_done(struct btree *b)
211 iter->used = 0; 204 iter->used = 0;
212 205
213#ifdef CONFIG_BCACHE_DEBUG 206#ifdef CONFIG_BCACHE_DEBUG
214 iter->b = b; 207 iter->b = &b->keys;
215#endif 208#endif
216 209
217 if (!i->seq) 210 if (!i->seq)
218 goto err; 211 goto err;
219 212
220 for (; 213 for (;
221 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; 214 b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
222 i = write_block(b)) { 215 i = write_block(b)) {
223 err = "unsupported bset version"; 216 err = "unsupported bset version";
224 if (i->version > BCACHE_BSET_VERSION) 217 if (i->version > BCACHE_BSET_VERSION)
225 goto err; 218 goto err;
226 219
227 err = "bad btree header"; 220 err = "bad btree header";
228 if (b->written + set_blocks(i, b->c) > btree_blocks(b)) 221 if (b->written + set_blocks(i, block_bytes(b->c)) >
222 btree_blocks(b))
229 goto err; 223 goto err;
230 224
231 err = "bad magic"; 225 err = "bad magic";
@@ -245,39 +239,40 @@ static void bch_btree_node_read_done(struct btree *b)
245 } 239 }
246 240
247 err = "empty set"; 241 err = "empty set";
248 if (i != b->sets[0].data && !i->keys) 242 if (i != b->keys.set[0].data && !i->keys)
249 goto err; 243 goto err;
250 244
251 bch_btree_iter_push(iter, i->start, end(i)); 245 bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
252 246
253 b->written += set_blocks(i, b->c); 247 b->written += set_blocks(i, block_bytes(b->c));
254 } 248 }
255 249
256 err = "corrupted btree"; 250 err = "corrupted btree";
257 for (i = write_block(b); 251 for (i = write_block(b);
258 index(i, b) < btree_blocks(b); 252 bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
259 i = ((void *) i) + block_bytes(b->c)) 253 i = ((void *) i) + block_bytes(b->c))
260 if (i->seq == b->sets[0].data->seq) 254 if (i->seq == b->keys.set[0].data->seq)
261 goto err; 255 goto err;
262 256
263 bch_btree_sort_and_fix_extents(b, iter); 257 bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
264 258
265 i = b->sets[0].data; 259 i = b->keys.set[0].data;
266 err = "short btree key"; 260 err = "short btree key";
267 if (b->sets[0].size && 261 if (b->keys.set[0].size &&
268 bkey_cmp(&b->key, &b->sets[0].end) < 0) 262 bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
269 goto err; 263 goto err;
270 264
271 if (b->written < btree_blocks(b)) 265 if (b->written < btree_blocks(b))
272 bch_bset_init_next(b); 266 bch_bset_init_next(&b->keys, write_block(b),
267 bset_magic(&b->c->sb));
273out: 268out:
274 mempool_free(iter, b->c->fill_iter); 269 mempool_free(iter, b->c->fill_iter);
275 return; 270 return;
276err: 271err:
277 set_btree_node_io_error(b); 272 set_btree_node_io_error(b);
278 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 273 bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
279 err, PTR_BUCKET_NR(b->c, &b->key, 0), 274 err, PTR_BUCKET_NR(b->c, &b->key, 0),
280 index(i, b), i->keys); 275 bset_block_offset(b, i), i->keys);
281 goto out; 276 goto out;
282} 277}
283 278
@@ -287,7 +282,7 @@ static void btree_node_read_endio(struct bio *bio, int error)
287 closure_put(cl); 282 closure_put(cl);
288} 283}
289 284
290void bch_btree_node_read(struct btree *b) 285static void bch_btree_node_read(struct btree *b)
291{ 286{
292 uint64_t start_time = local_clock(); 287 uint64_t start_time = local_clock();
293 struct closure cl; 288 struct closure cl;
@@ -299,11 +294,11 @@ void bch_btree_node_read(struct btree *b)
299 294
300 bio = bch_bbio_alloc(b->c); 295 bio = bch_bbio_alloc(b->c);
301 bio->bi_rw = REQ_META|READ_SYNC; 296 bio->bi_rw = REQ_META|READ_SYNC;
302 bio->bi_size = KEY_SIZE(&b->key) << 9; 297 bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
303 bio->bi_end_io = btree_node_read_endio; 298 bio->bi_end_io = btree_node_read_endio;
304 bio->bi_private = &cl; 299 bio->bi_private = &cl;
305 300
306 bch_bio_map(bio, b->sets[0].data); 301 bch_bio_map(bio, b->keys.set[0].data);
307 302
308 bch_submit_bbio(bio, b->c, &b->key, 0); 303 bch_submit_bbio(bio, b->c, &b->key, 0);
309 closure_sync(&cl); 304 closure_sync(&cl);
@@ -340,9 +335,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
340 w->journal = NULL; 335 w->journal = NULL;
341} 336}
342 337
338static void btree_node_write_unlock(struct closure *cl)
339{
340 struct btree *b = container_of(cl, struct btree, io);
341
342 up(&b->io_mutex);
343}
344
343static void __btree_node_write_done(struct closure *cl) 345static void __btree_node_write_done(struct closure *cl)
344{ 346{
345 struct btree *b = container_of(cl, struct btree, io.cl); 347 struct btree *b = container_of(cl, struct btree, io);
346 struct btree_write *w = btree_prev_write(b); 348 struct btree_write *w = btree_prev_write(b);
347 349
348 bch_bbio_free(b->bio, b->c); 350 bch_bbio_free(b->bio, b->c);
@@ -353,16 +355,16 @@ static void __btree_node_write_done(struct closure *cl)
353 queue_delayed_work(btree_io_wq, &b->work, 355 queue_delayed_work(btree_io_wq, &b->work,
354 msecs_to_jiffies(30000)); 356 msecs_to_jiffies(30000));
355 357
356 closure_return(cl); 358 closure_return_with_destructor(cl, btree_node_write_unlock);
357} 359}
358 360
359static void btree_node_write_done(struct closure *cl) 361static void btree_node_write_done(struct closure *cl)
360{ 362{
361 struct btree *b = container_of(cl, struct btree, io.cl); 363 struct btree *b = container_of(cl, struct btree, io);
362 struct bio_vec *bv; 364 struct bio_vec *bv;
363 int n; 365 int n;
364 366
365 __bio_for_each_segment(bv, b->bio, n, 0) 367 bio_for_each_segment_all(bv, b->bio, n)
366 __free_page(bv->bv_page); 368 __free_page(bv->bv_page);
367 369
368 __btree_node_write_done(cl); 370 __btree_node_write_done(cl);
@@ -371,7 +373,7 @@ static void btree_node_write_done(struct closure *cl)
371static void btree_node_write_endio(struct bio *bio, int error) 373static void btree_node_write_endio(struct bio *bio, int error)
372{ 374{
373 struct closure *cl = bio->bi_private; 375 struct closure *cl = bio->bi_private;
374 struct btree *b = container_of(cl, struct btree, io.cl); 376 struct btree *b = container_of(cl, struct btree, io);
375 377
376 if (error) 378 if (error)
377 set_btree_node_io_error(b); 379 set_btree_node_io_error(b);
@@ -382,8 +384,8 @@ static void btree_node_write_endio(struct bio *bio, int error)
382 384
383static void do_btree_node_write(struct btree *b) 385static void do_btree_node_write(struct btree *b)
384{ 386{
385 struct closure *cl = &b->io.cl; 387 struct closure *cl = &b->io;
386 struct bset *i = b->sets[b->nsets].data; 388 struct bset *i = btree_bset_last(b);
387 BKEY_PADDED(key) k; 389 BKEY_PADDED(key) k;
388 390
389 i->version = BCACHE_BSET_VERSION; 391 i->version = BCACHE_BSET_VERSION;
@@ -395,7 +397,7 @@ static void do_btree_node_write(struct btree *b)
395 b->bio->bi_end_io = btree_node_write_endio; 397 b->bio->bi_end_io = btree_node_write_endio;
396 b->bio->bi_private = cl; 398 b->bio->bi_private = cl;
397 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 399 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
398 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 400 b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
399 bch_bio_map(b->bio, i); 401 bch_bio_map(b->bio, i);
400 402
401 /* 403 /*
@@ -414,14 +416,15 @@ static void do_btree_node_write(struct btree *b)
414 */ 416 */
415 417
416 bkey_copy(&k.key, &b->key); 418 bkey_copy(&k.key, &b->key);
417 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 419 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
420 bset_sector_offset(&b->keys, i));
418 421
419 if (!bio_alloc_pages(b->bio, GFP_NOIO)) { 422 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
420 int j; 423 int j;
421 struct bio_vec *bv; 424 struct bio_vec *bv;
422 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 425 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
423 426
424 bio_for_each_segment(bv, b->bio, j) 427 bio_for_each_segment_all(bv, b->bio, j)
425 memcpy(page_address(bv->bv_page), 428 memcpy(page_address(bv->bv_page),
426 base + j * PAGE_SIZE, PAGE_SIZE); 429 base + j * PAGE_SIZE, PAGE_SIZE);
427 430
@@ -435,40 +438,54 @@ static void do_btree_node_write(struct btree *b)
435 bch_submit_bbio(b->bio, b->c, &k.key, 0); 438 bch_submit_bbio(b->bio, b->c, &k.key, 0);
436 439
437 closure_sync(cl); 440 closure_sync(cl);
438 __btree_node_write_done(cl); 441 continue_at_nobarrier(cl, __btree_node_write_done, NULL);
439 } 442 }
440} 443}
441 444
442void bch_btree_node_write(struct btree *b, struct closure *parent) 445void bch_btree_node_write(struct btree *b, struct closure *parent)
443{ 446{
444 struct bset *i = b->sets[b->nsets].data; 447 struct bset *i = btree_bset_last(b);
445 448
446 trace_bcache_btree_write(b); 449 trace_bcache_btree_write(b);
447 450
448 BUG_ON(current->bio_list); 451 BUG_ON(current->bio_list);
449 BUG_ON(b->written >= btree_blocks(b)); 452 BUG_ON(b->written >= btree_blocks(b));
450 BUG_ON(b->written && !i->keys); 453 BUG_ON(b->written && !i->keys);
451 BUG_ON(b->sets->data->seq != i->seq); 454 BUG_ON(btree_bset_first(b)->seq != i->seq);
452 bch_check_keys(b, "writing"); 455 bch_check_keys(&b->keys, "writing");
453 456
454 cancel_delayed_work(&b->work); 457 cancel_delayed_work(&b->work);
455 458
456 /* If caller isn't waiting for write, parent refcount is cache set */ 459 /* If caller isn't waiting for write, parent refcount is cache set */
457 closure_lock(&b->io, parent ?: &b->c->cl); 460 down(&b->io_mutex);
461 closure_init(&b->io, parent ?: &b->c->cl);
458 462
459 clear_bit(BTREE_NODE_dirty, &b->flags); 463 clear_bit(BTREE_NODE_dirty, &b->flags);
460 change_bit(BTREE_NODE_write_idx, &b->flags); 464 change_bit(BTREE_NODE_write_idx, &b->flags);
461 465
462 do_btree_node_write(b); 466 do_btree_node_write(b);
463 467
464 b->written += set_blocks(i, b->c); 468 atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
465 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
466 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 469 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
467 470
468 bch_btree_sort_lazy(b); 471 b->written += set_blocks(i, block_bytes(b->c));
472
473 /* If not a leaf node, always sort */
474 if (b->level && b->keys.nsets)
475 bch_btree_sort(&b->keys, &b->c->sort);
476 else
477 bch_btree_sort_lazy(&b->keys, &b->c->sort);
478
479 /*
480 * do verify if there was more than one set initially (i.e. we did a
481 * sort) and we sorted down to a single set:
482 */
483 if (i != b->keys.set->data && !b->keys.nsets)
484 bch_btree_verify(b);
469 485
470 if (b->written < btree_blocks(b)) 486 if (b->written < btree_blocks(b))
471 bch_bset_init_next(b); 487 bch_bset_init_next(&b->keys, write_block(b),
488 bset_magic(&b->c->sb));
472} 489}
473 490
474static void bch_btree_node_write_sync(struct btree *b) 491static void bch_btree_node_write_sync(struct btree *b)
@@ -493,7 +510,7 @@ static void btree_node_write_work(struct work_struct *w)
493 510
494static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) 511static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
495{ 512{
496 struct bset *i = b->sets[b->nsets].data; 513 struct bset *i = btree_bset_last(b);
497 struct btree_write *w = btree_current_write(b); 514 struct btree_write *w = btree_current_write(b);
498 515
499 BUG_ON(!b->written); 516 BUG_ON(!b->written);
@@ -528,24 +545,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
528 * mca -> memory cache 545 * mca -> memory cache
529 */ 546 */
530 547
531static void mca_reinit(struct btree *b)
532{
533 unsigned i;
534
535 b->flags = 0;
536 b->written = 0;
537 b->nsets = 0;
538
539 for (i = 0; i < MAX_BSETS; i++)
540 b->sets[i].size = 0;
541 /*
542 * Second loop starts at 1 because b->sets[0]->data is the memory we
543 * allocated
544 */
545 for (i = 1; i < MAX_BSETS; i++)
546 b->sets[i].data = NULL;
547}
548
549#define mca_reserve(c) (((c->root && c->root->level) \ 548#define mca_reserve(c) (((c->root && c->root->level) \
550 ? c->root->level : 1) * 8 + 16) 549 ? c->root->level : 1) * 8 + 16)
551#define mca_can_free(c) \ 550#define mca_can_free(c) \
@@ -553,28 +552,12 @@ static void mca_reinit(struct btree *b)
553 552
554static void mca_data_free(struct btree *b) 553static void mca_data_free(struct btree *b)
555{ 554{
556 struct bset_tree *t = b->sets; 555 BUG_ON(b->io_mutex.count != 1);
557 BUG_ON(!closure_is_unlocked(&b->io.cl));
558 556
559 if (bset_prev_bytes(b) < PAGE_SIZE) 557 bch_btree_keys_free(&b->keys);
560 kfree(t->prev);
561 else
562 free_pages((unsigned long) t->prev,
563 get_order(bset_prev_bytes(b)));
564 558
565 if (bset_tree_bytes(b) < PAGE_SIZE)
566 kfree(t->tree);
567 else
568 free_pages((unsigned long) t->tree,
569 get_order(bset_tree_bytes(b)));
570
571 free_pages((unsigned long) t->data, b->page_order);
572
573 t->prev = NULL;
574 t->tree = NULL;
575 t->data = NULL;
576 list_move(&b->list, &b->c->btree_cache_freed);
577 b->c->bucket_cache_used--; 559 b->c->bucket_cache_used--;
560 list_move(&b->list, &b->c->btree_cache_freed);
578} 561}
579 562
580static void mca_bucket_free(struct btree *b) 563static void mca_bucket_free(struct btree *b)
@@ -593,34 +576,16 @@ static unsigned btree_order(struct bkey *k)
593 576
594static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 577static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
595{ 578{
596 struct bset_tree *t = b->sets; 579 if (!bch_btree_keys_alloc(&b->keys,
597 BUG_ON(t->data); 580 max_t(unsigned,
598 581 ilog2(b->c->btree_pages),
599 b->page_order = max_t(unsigned, 582 btree_order(k)),
600 ilog2(b->c->btree_pages), 583 gfp)) {
601 btree_order(k)); 584 b->c->bucket_cache_used++;
602 585 list_move(&b->list, &b->c->btree_cache);
603 t->data = (void *) __get_free_pages(gfp, b->page_order); 586 } else {
604 if (!t->data) 587 list_move(&b->list, &b->c->btree_cache_freed);
605 goto err; 588 }
606
607 t->tree = bset_tree_bytes(b) < PAGE_SIZE
608 ? kmalloc(bset_tree_bytes(b), gfp)
609 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
610 if (!t->tree)
611 goto err;
612
613 t->prev = bset_prev_bytes(b) < PAGE_SIZE
614 ? kmalloc(bset_prev_bytes(b), gfp)
615 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
616 if (!t->prev)
617 goto err;
618
619 list_move(&b->list, &b->c->btree_cache);
620 b->c->bucket_cache_used++;
621 return;
622err:
623 mca_data_free(b);
624} 589}
625 590
626static struct btree *mca_bucket_alloc(struct cache_set *c, 591static struct btree *mca_bucket_alloc(struct cache_set *c,
@@ -635,7 +600,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
635 INIT_LIST_HEAD(&b->list); 600 INIT_LIST_HEAD(&b->list);
636 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 601 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
637 b->c = c; 602 b->c = c;
638 closure_init_unlocked(&b->io); 603 sema_init(&b->io_mutex, 1);
639 604
640 mca_data_alloc(b, k, gfp); 605 mca_data_alloc(b, k, gfp);
641 return b; 606 return b;
@@ -651,24 +616,31 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
651 if (!down_write_trylock(&b->lock)) 616 if (!down_write_trylock(&b->lock))
652 return -ENOMEM; 617 return -ENOMEM;
653 618
654 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 619 BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
655 620
656 if (b->page_order < min_order || 621 if (b->keys.page_order < min_order)
657 (!flush && 622 goto out_unlock;
658 (btree_node_dirty(b) || 623
659 atomic_read(&b->io.cl.remaining) != -1))) { 624 if (!flush) {
660 rw_unlock(true, b); 625 if (btree_node_dirty(b))
661 return -ENOMEM; 626 goto out_unlock;
627
628 if (down_trylock(&b->io_mutex))
629 goto out_unlock;
630 up(&b->io_mutex);
662 } 631 }
663 632
664 if (btree_node_dirty(b)) 633 if (btree_node_dirty(b))
665 bch_btree_node_write_sync(b); 634 bch_btree_node_write_sync(b);
666 635
667 /* wait for any in flight btree write */ 636 /* wait for any in flight btree write */
668 closure_wait_event(&b->io.wait, &cl, 637 down(&b->io_mutex);
669 atomic_read(&b->io.cl.remaining) == -1); 638 up(&b->io_mutex);
670 639
671 return 0; 640 return 0;
641out_unlock:
642 rw_unlock(true, b);
643 return -ENOMEM;
672} 644}
673 645
674static unsigned long bch_mca_scan(struct shrinker *shrink, 646static unsigned long bch_mca_scan(struct shrinker *shrink,
@@ -714,14 +686,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
714 } 686 }
715 } 687 }
716 688
717 /*
718 * Can happen right when we first start up, before we've read in any
719 * btree nodes
720 */
721 if (list_empty(&c->btree_cache))
722 goto out;
723
724 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { 689 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
690 if (list_empty(&c->btree_cache))
691 goto out;
692
725 b = list_first_entry(&c->btree_cache, struct btree, list); 693 b = list_first_entry(&c->btree_cache, struct btree, list);
726 list_rotate_left(&c->btree_cache); 694 list_rotate_left(&c->btree_cache);
727 695
@@ -767,6 +735,8 @@ void bch_btree_cache_free(struct cache_set *c)
767#ifdef CONFIG_BCACHE_DEBUG 735#ifdef CONFIG_BCACHE_DEBUG
768 if (c->verify_data) 736 if (c->verify_data)
769 list_move(&c->verify_data->list, &c->btree_cache); 737 list_move(&c->verify_data->list, &c->btree_cache);
738
739 free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
770#endif 740#endif
771 741
772 list_splice(&c->btree_cache_freeable, 742 list_splice(&c->btree_cache_freeable,
@@ -807,10 +777,13 @@ int bch_btree_cache_alloc(struct cache_set *c)
807#ifdef CONFIG_BCACHE_DEBUG 777#ifdef CONFIG_BCACHE_DEBUG
808 mutex_init(&c->verify_lock); 778 mutex_init(&c->verify_lock);
809 779
780 c->verify_ondisk = (void *)
781 __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
782
810 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 783 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
811 784
812 if (c->verify_data && 785 if (c->verify_data &&
813 c->verify_data->sets[0].data) 786 c->verify_data->keys.set->data)
814 list_del_init(&c->verify_data->list); 787 list_del_init(&c->verify_data->list);
815 else 788 else
816 c->verify_data = NULL; 789 c->verify_data = NULL;
@@ -908,7 +881,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
908 list_for_each_entry(b, &c->btree_cache_freed, list) 881 list_for_each_entry(b, &c->btree_cache_freed, list)
909 if (!mca_reap(b, 0, false)) { 882 if (!mca_reap(b, 0, false)) {
910 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 883 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
911 if (!b->sets[0].data) 884 if (!b->keys.set[0].data)
912 goto err; 885 goto err;
913 else 886 else
914 goto out; 887 goto out;
@@ -919,10 +892,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
919 goto err; 892 goto err;
920 893
921 BUG_ON(!down_write_trylock(&b->lock)); 894 BUG_ON(!down_write_trylock(&b->lock));
922 if (!b->sets->data) 895 if (!b->keys.set->data)
923 goto err; 896 goto err;
924out: 897out:
925 BUG_ON(!closure_is_unlocked(&b->io.cl)); 898 BUG_ON(b->io_mutex.count != 1);
926 899
927 bkey_copy(&b->key, k); 900 bkey_copy(&b->key, k);
928 list_move(&b->list, &c->btree_cache); 901 list_move(&b->list, &c->btree_cache);
@@ -930,10 +903,17 @@ out:
930 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 903 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
931 904
932 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 905 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
933 b->level = level;
934 b->parent = (void *) ~0UL; 906 b->parent = (void *) ~0UL;
907 b->flags = 0;
908 b->written = 0;
909 b->level = level;
935 910
936 mca_reinit(b); 911 if (!b->level)
912 bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
913 &b->c->expensive_debug_checks);
914 else
915 bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
916 &b->c->expensive_debug_checks);
937 917
938 return b; 918 return b;
939err: 919err:
@@ -994,13 +974,13 @@ retry:
994 974
995 b->accessed = 1; 975 b->accessed = 1;
996 976
997 for (; i <= b->nsets && b->sets[i].size; i++) { 977 for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
998 prefetch(b->sets[i].tree); 978 prefetch(b->keys.set[i].tree);
999 prefetch(b->sets[i].data); 979 prefetch(b->keys.set[i].data);
1000 } 980 }
1001 981
1002 for (; i <= b->nsets; i++) 982 for (; i <= b->keys.nsets; i++)
1003 prefetch(b->sets[i].data); 983 prefetch(b->keys.set[i].data);
1004 984
1005 if (btree_node_io_error(b)) { 985 if (btree_node_io_error(b)) {
1006 rw_unlock(write, b); 986 rw_unlock(write, b);
@@ -1063,7 +1043,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
1063 1043
1064 mutex_lock(&c->bucket_lock); 1044 mutex_lock(&c->bucket_lock);
1065retry: 1045retry:
1066 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) 1046 if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
1067 goto err; 1047 goto err;
1068 1048
1069 bkey_put(c, &k.key); 1049 bkey_put(c, &k.key);
@@ -1080,7 +1060,7 @@ retry:
1080 } 1060 }
1081 1061
1082 b->accessed = 1; 1062 b->accessed = 1;
1083 bch_bset_init_next(b); 1063 bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
1084 1064
1085 mutex_unlock(&c->bucket_lock); 1065 mutex_unlock(&c->bucket_lock);
1086 1066
@@ -1098,8 +1078,10 @@ err:
1098static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) 1078static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
1099{ 1079{
1100 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); 1080 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
1101 if (!IS_ERR_OR_NULL(n)) 1081 if (!IS_ERR_OR_NULL(n)) {
1102 bch_btree_sort_into(b, n); 1082 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1083 bkey_copy_key(&n->key, &b->key);
1084 }
1103 1085
1104 return n; 1086 return n;
1105} 1087}
@@ -1120,6 +1102,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1120 atomic_inc(&b->c->prio_blocked); 1102 atomic_inc(&b->c->prio_blocked);
1121} 1103}
1122 1104
1105static int btree_check_reserve(struct btree *b, struct btree_op *op)
1106{
1107 struct cache_set *c = b->c;
1108 struct cache *ca;
1109 unsigned i, reserve = c->root->level * 2 + 1;
1110 int ret = 0;
1111
1112 mutex_lock(&c->bucket_lock);
1113
1114 for_each_cache(ca, c, i)
1115 if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1116 if (op)
1117 prepare_to_wait(&c->bucket_wait, &op->wait,
1118 TASK_UNINTERRUPTIBLE);
1119 ret = -EINTR;
1120 break;
1121 }
1122
1123 mutex_unlock(&c->bucket_lock);
1124 return ret;
1125}
1126
1123/* Garbage collection */ 1127/* Garbage collection */
1124 1128
1125uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1129uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
@@ -1163,7 +1167,7 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1163 /* guard against overflow */ 1167 /* guard against overflow */
1164 SET_GC_SECTORS_USED(g, min_t(unsigned, 1168 SET_GC_SECTORS_USED(g, min_t(unsigned,
1165 GC_SECTORS_USED(g) + KEY_SIZE(k), 1169 GC_SECTORS_USED(g) + KEY_SIZE(k),
1166 (1 << 14) - 1)); 1170 MAX_GC_SECTORS_USED));
1167 1171
1168 BUG_ON(!GC_SECTORS_USED(g)); 1172 BUG_ON(!GC_SECTORS_USED(g));
1169 } 1173 }
@@ -1183,11 +1187,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1183 1187
1184 gc->nodes++; 1188 gc->nodes++;
1185 1189
1186 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1190 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1187 stale = max(stale, btree_mark_key(b, k)); 1191 stale = max(stale, btree_mark_key(b, k));
1188 keys++; 1192 keys++;
1189 1193
1190 if (bch_ptr_bad(b, k)) 1194 if (bch_ptr_bad(&b->keys, k))
1191 continue; 1195 continue;
1192 1196
1193 gc->key_bytes += bkey_u64s(k); 1197 gc->key_bytes += bkey_u64s(k);
@@ -1197,9 +1201,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1197 gc->data += KEY_SIZE(k); 1201 gc->data += KEY_SIZE(k);
1198 } 1202 }
1199 1203
1200 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1204 for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1201 btree_bug_on(t->size && 1205 btree_bug_on(t->size &&
1202 bset_written(b, t) && 1206 bset_written(&b->keys, t) &&
1203 bkey_cmp(&b->key, &t->end) < 0, 1207 bkey_cmp(&b->key, &t->end) < 0,
1204 b, "found short btree key in gc"); 1208 b, "found short btree key in gc");
1205 1209
@@ -1243,7 +1247,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1243 blocks = btree_default_blocks(b->c) * 2 / 3; 1247 blocks = btree_default_blocks(b->c) * 2 / 3;
1244 1248
1245 if (nodes < 2 || 1249 if (nodes < 2 ||
1246 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1250 __set_blocks(b->keys.set[0].data, keys,
1251 block_bytes(b->c)) > blocks * (nodes - 1))
1247 return 0; 1252 return 0;
1248 1253
1249 for (i = 0; i < nodes; i++) { 1254 for (i = 0; i < nodes; i++) {
@@ -1253,18 +1258,19 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1253 } 1258 }
1254 1259
1255 for (i = nodes - 1; i > 0; --i) { 1260 for (i = nodes - 1; i > 0; --i) {
1256 struct bset *n1 = new_nodes[i]->sets->data; 1261 struct bset *n1 = btree_bset_first(new_nodes[i]);
1257 struct bset *n2 = new_nodes[i - 1]->sets->data; 1262 struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
1258 struct bkey *k, *last = NULL; 1263 struct bkey *k, *last = NULL;
1259 1264
1260 keys = 0; 1265 keys = 0;
1261 1266
1262 if (i > 1) { 1267 if (i > 1) {
1263 for (k = n2->start; 1268 for (k = n2->start;
1264 k < end(n2); 1269 k < bset_bkey_last(n2);
1265 k = bkey_next(k)) { 1270 k = bkey_next(k)) {
1266 if (__set_blocks(n1, n1->keys + keys + 1271 if (__set_blocks(n1, n1->keys + keys +
1267 bkey_u64s(k), b->c) > blocks) 1272 bkey_u64s(k),
1273 block_bytes(b->c)) > blocks)
1268 break; 1274 break;
1269 1275
1270 last = k; 1276 last = k;
@@ -1280,7 +1286,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1280 * though) 1286 * though)
1281 */ 1287 */
1282 if (__set_blocks(n1, n1->keys + n2->keys, 1288 if (__set_blocks(n1, n1->keys + n2->keys,
1283 b->c) > btree_blocks(new_nodes[i])) 1289 block_bytes(b->c)) >
1290 btree_blocks(new_nodes[i]))
1284 goto out_nocoalesce; 1291 goto out_nocoalesce;
1285 1292
1286 keys = n2->keys; 1293 keys = n2->keys;
@@ -1288,27 +1295,28 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1288 last = &r->b->key; 1295 last = &r->b->key;
1289 } 1296 }
1290 1297
1291 BUG_ON(__set_blocks(n1, n1->keys + keys, 1298 BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
1292 b->c) > btree_blocks(new_nodes[i])); 1299 btree_blocks(new_nodes[i]));
1293 1300
1294 if (last) 1301 if (last)
1295 bkey_copy_key(&new_nodes[i]->key, last); 1302 bkey_copy_key(&new_nodes[i]->key, last);
1296 1303
1297 memcpy(end(n1), 1304 memcpy(bset_bkey_last(n1),
1298 n2->start, 1305 n2->start,
1299 (void *) node(n2, keys) - (void *) n2->start); 1306 (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
1300 1307
1301 n1->keys += keys; 1308 n1->keys += keys;
1302 r[i].keys = n1->keys; 1309 r[i].keys = n1->keys;
1303 1310
1304 memmove(n2->start, 1311 memmove(n2->start,
1305 node(n2, keys), 1312 bset_bkey_idx(n2, keys),
1306 (void *) end(n2) - (void *) node(n2, keys)); 1313 (void *) bset_bkey_last(n2) -
1314 (void *) bset_bkey_idx(n2, keys));
1307 1315
1308 n2->keys -= keys; 1316 n2->keys -= keys;
1309 1317
1310 if (bch_keylist_realloc(keylist, 1318 if (__bch_keylist_realloc(keylist,
1311 KEY_PTRS(&new_nodes[i]->key), b->c)) 1319 bkey_u64s(&new_nodes[i]->key)))
1312 goto out_nocoalesce; 1320 goto out_nocoalesce;
1313 1321
1314 bch_btree_node_write(new_nodes[i], &cl); 1322 bch_btree_node_write(new_nodes[i], &cl);
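The memcpy/memmove pair above appends the first 'keys' 64-bit words of key data from n2 onto the tail of n1 and then compacts n2, now expressed through the bset_bkey_last()/bset_bkey_idx() accessors. A self-contained sketch of that move, with a bset reduced to a plain counted array of uint64_t (toy types only, not the on-disk format):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_set {
        uint64_t d[16];         /* key data, counted in 64-bit words */
        unsigned keys;
};

/* Append the first 'keys' words of n2 to n1, then compact n2. */
static void move_front(struct toy_set *n1, struct toy_set *n2, unsigned keys)
{
        memcpy(n1->d + n1->keys, n2->d, keys * sizeof(uint64_t));
        n1->keys += keys;

        memmove(n2->d, n2->d + keys, (n2->keys - keys) * sizeof(uint64_t));
        n2->keys -= keys;
}

int main(void)
{
        struct toy_set a = { .d = { 1, 2 }, .keys = 2 };
        struct toy_set b = { .d = { 3, 4, 5 }, .keys = 3 };

        move_front(&a, &b, 2);
        printf("a.keys=%u b.keys=%u a.d[3]=%llu\n", a.keys, b.keys,
               (unsigned long long) a.d[3]);    /* a.keys=4 b.keys=1 a.d[3]=4 */
        return 0;
}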
@@ -1316,7 +1324,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1316 } 1324 }
1317 1325
1318 for (i = 0; i < nodes; i++) { 1326 for (i = 0; i < nodes; i++) {
1319 if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) 1327 if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
1320 goto out_nocoalesce; 1328 goto out_nocoalesce;
1321 1329
1322 make_btree_freeing_key(r[i].b, keylist->top); 1330 make_btree_freeing_key(r[i].b, keylist->top);
@@ -1324,7 +1332,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1324 } 1332 }
1325 1333
1326 /* We emptied out this node */ 1334 /* We emptied out this node */
1327 BUG_ON(new_nodes[0]->sets->data->keys); 1335 BUG_ON(btree_bset_first(new_nodes[0])->keys);
1328 btree_node_free(new_nodes[0]); 1336 btree_node_free(new_nodes[0]);
1329 rw_unlock(true, new_nodes[0]); 1337 rw_unlock(true, new_nodes[0]);
1330 1338
@@ -1370,7 +1378,7 @@ static unsigned btree_gc_count_keys(struct btree *b)
1370 struct btree_iter iter; 1378 struct btree_iter iter;
1371 unsigned ret = 0; 1379 unsigned ret = 0;
1372 1380
1373 for_each_key_filter(b, k, &iter, bch_ptr_bad) 1381 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1374 ret += bkey_u64s(k); 1382 ret += bkey_u64s(k);
1375 1383
1376 return ret; 1384 return ret;
@@ -1390,13 +1398,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1390 struct gc_merge_info *last = r + GC_MERGE_NODES - 1; 1398 struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
1391 1399
1392 bch_keylist_init(&keys); 1400 bch_keylist_init(&keys);
1393 bch_btree_iter_init(b, &iter, &b->c->gc_done); 1401 bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
1394 1402
1395 for (i = 0; i < GC_MERGE_NODES; i++) 1403 for (i = 0; i < GC_MERGE_NODES; i++)
1396 r[i].b = ERR_PTR(-EINTR); 1404 r[i].b = ERR_PTR(-EINTR);
1397 1405
1398 while (1) { 1406 while (1) {
1399 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1407 k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
1400 if (k) { 1408 if (k) {
1401 r->b = bch_btree_node_get(b->c, k, b->level - 1, true); 1409 r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1402 if (IS_ERR(r->b)) { 1410 if (IS_ERR(r->b)) {
@@ -1416,7 +1424,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1416 1424
1417 if (!IS_ERR(last->b)) { 1425 if (!IS_ERR(last->b)) {
1418 should_rewrite = btree_gc_mark_node(last->b, gc); 1426 should_rewrite = btree_gc_mark_node(last->b, gc);
1419 if (should_rewrite) { 1427 if (should_rewrite &&
1428 !btree_check_reserve(b, NULL)) {
1420 n = btree_node_alloc_replacement(last->b, 1429 n = btree_node_alloc_replacement(last->b,
1421 false); 1430 false);
1422 1431
@@ -1705,7 +1714,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1705 struct bucket *g; 1714 struct bucket *g;
1706 struct btree_iter iter; 1715 struct btree_iter iter;
1707 1716
1708 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1717 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1709 for (i = 0; i < KEY_PTRS(k); i++) { 1718 for (i = 0; i < KEY_PTRS(k); i++) {
1710 if (!ptr_available(b->c, k, i)) 1719 if (!ptr_available(b->c, k, i))
1711 continue; 1720 continue;
@@ -1728,10 +1737,11 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1728 } 1737 }
1729 1738
1730 if (b->level) { 1739 if (b->level) {
1731 bch_btree_iter_init(b, &iter, NULL); 1740 bch_btree_iter_init(&b->keys, &iter, NULL);
1732 1741
1733 do { 1742 do {
1734 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1743 k = bch_btree_iter_next_filter(&iter, &b->keys,
1744 bch_ptr_bad);
1735 if (k) 1745 if (k)
1736 btree_node_prefetch(b->c, k, b->level - 1); 1746 btree_node_prefetch(b->c, k, b->level - 1);
1737 1747
@@ -1774,235 +1784,36 @@ err:
1774 1784
1775/* Btree insertion */ 1785/* Btree insertion */
1776 1786
1777static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) 1787static bool btree_insert_key(struct btree *b, struct bkey *k,
1778{ 1788 struct bkey *replace_key)
1779 struct bset *i = b->sets[b->nsets].data;
1780
1781 memmove((uint64_t *) where + bkey_u64s(insert),
1782 where,
1783 (void *) end(i) - (void *) where);
1784
1785 i->keys += bkey_u64s(insert);
1786 bkey_copy(where, insert);
1787 bch_bset_fix_lookup_table(b, where);
1788}
1789
1790static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
1791 struct btree_iter *iter,
1792 struct bkey *replace_key)
1793{ 1789{
1794 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1790 unsigned status;
1795 {
1796 if (KEY_DIRTY(k))
1797 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1798 offset, -sectors);
1799 }
1800
1801 uint64_t old_offset;
1802 unsigned old_size, sectors_found = 0;
1803
1804 while (1) {
1805 struct bkey *k = bch_btree_iter_next(iter);
1806 if (!k ||
1807 bkey_cmp(&START_KEY(k), insert) >= 0)
1808 break;
1809
1810 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1811 continue;
1812
1813 old_offset = KEY_START(k);
1814 old_size = KEY_SIZE(k);
1815
1816 /*
1817 * We might overlap with 0 size extents; we can't skip these
1818 * because if they're in the set we're inserting to we have to
1819 * adjust them so they don't overlap with the key we're
1820 * inserting. But we don't want to check them for replace
1821 * operations.
1822 */
1823
1824 if (replace_key && KEY_SIZE(k)) {
1825 /*
1826 * k might have been split since we inserted/found the
1827 * key we're replacing
1828 */
1829 unsigned i;
1830 uint64_t offset = KEY_START(k) -
1831 KEY_START(replace_key);
1832
1833 /* But it must be a subset of the replace key */
1834 if (KEY_START(k) < KEY_START(replace_key) ||
1835 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
1836 goto check_failed;
1837
1838 /* We didn't find a key that we were supposed to */
1839 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1840 goto check_failed;
1841
1842 if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
1843 KEY_DIRTY(k) != KEY_DIRTY(replace_key))
1844 goto check_failed;
1845
1846 /* skip past gen */
1847 offset <<= 8;
1848
1849 BUG_ON(!KEY_PTRS(replace_key));
1850 1791
1851 for (i = 0; i < KEY_PTRS(replace_key); i++) 1792 BUG_ON(bkey_cmp(k, &b->key) > 0);
1852 if (k->ptr[i] != replace_key->ptr[i] + offset)
1853 goto check_failed;
1854
1855 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1856 }
1857
1858 if (bkey_cmp(insert, k) < 0 &&
1859 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1860 /*
1861 * We overlapped in the middle of an existing key: that
1862 * means we have to split the old key. But we have to do
1863 * slightly different things depending on whether the
1864 * old key has been written out yet.
1865 */
1866
1867 struct bkey *top;
1868
1869 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1870
1871 if (bkey_written(b, k)) {
1872 /*
1873 * We insert a new key to cover the top of the
1874 * old key, and the old key is modified in place
1875 * to represent the bottom split.
1876 *
1877 * It's completely arbitrary whether the new key
1878 * is the top or the bottom, but it has to match
1879 * up with what btree_sort_fixup() does - it
1880 * doesn't check for this kind of overlap, it
1881 * depends on us inserting a new key for the top
1882 * here.
1883 */
1884 top = bch_bset_search(b, &b->sets[b->nsets],
1885 insert);
1886 shift_keys(b, top, k);
1887 } else {
1888 BKEY_PADDED(key) temp;
1889 bkey_copy(&temp.key, k);
1890 shift_keys(b, k, &temp.key);
1891 top = bkey_next(k);
1892 }
1893
1894 bch_cut_front(insert, top);
1895 bch_cut_back(&START_KEY(insert), k);
1896 bch_bset_fix_invalidated_key(b, k);
1897 return false;
1898 }
1899
1900 if (bkey_cmp(insert, k) < 0) {
1901 bch_cut_front(insert, k);
1902 } else {
1903 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1904 old_offset = KEY_START(insert);
1905
1906 if (bkey_written(b, k) &&
1907 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1908 /*
1909 * Completely overwrote, so we don't have to
1910 * invalidate the binary search tree
1911 */
1912 bch_cut_front(k, k);
1913 } else {
1914 __bch_cut_back(&START_KEY(insert), k);
1915 bch_bset_fix_invalidated_key(b, k);
1916 }
1917 }
1918
1919 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1920 }
1921 1793
1922check_failed: 1794 status = bch_btree_insert_key(&b->keys, k, replace_key);
1923 if (replace_key) { 1795 if (status != BTREE_INSERT_STATUS_NO_INSERT) {
1924 if (!sectors_found) { 1796 bch_check_keys(&b->keys, "%u for %s", status,
1925 return true; 1797 replace_key ? "replace" : "insert");
1926 } else if (sectors_found < KEY_SIZE(insert)) {
1927 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1928 (KEY_SIZE(insert) - sectors_found));
1929 SET_KEY_SIZE(insert, sectors_found);
1930 }
1931 }
1932 1798
1933 return false; 1799 trace_bcache_btree_insert_key(b, k, replace_key != NULL,
1800 status);
1801 return true;
1802 } else
1803 return false;
1934} 1804}
1935 1805
1936static bool btree_insert_key(struct btree *b, struct btree_op *op, 1806static size_t insert_u64s_remaining(struct btree *b)
1937 struct bkey *k, struct bkey *replace_key)
1938{ 1807{
1939 struct bset *i = b->sets[b->nsets].data; 1808 long ret = bch_btree_keys_u64s_remaining(&b->keys);
1940 struct bkey *m, *prev;
1941 unsigned status = BTREE_INSERT_STATUS_INSERT;
1942
1943 BUG_ON(bkey_cmp(k, &b->key) > 0);
1944 BUG_ON(b->level && !KEY_PTRS(k));
1945 BUG_ON(!b->level && !KEY_OFFSET(k));
1946
1947 if (!b->level) {
1948 struct btree_iter iter;
1949
1950 /*
1951 * bset_search() returns the first key that is strictly greater
1952 * than the search key - but for back merging, we want to find
1953 * the previous key.
1954 */
1955 prev = NULL;
1956 m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
1957 1809
1958 if (fix_overlapping_extents(b, k, &iter, replace_key)) { 1810 /*
1959 op->insert_collision = true; 1811 * Might land in the middle of an existing extent and have to split it
1960 return false; 1812 */
1961 } 1813 if (b->keys.ops->is_extents)
1962 1814 ret -= KEY_MAX_U64S;
1963 if (KEY_DIRTY(k))
1964 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1965 KEY_START(k), KEY_SIZE(k));
1966
1967 while (m != end(i) &&
1968 bkey_cmp(k, &START_KEY(m)) > 0)
1969 prev = m, m = bkey_next(m);
1970
1971 if (key_merging_disabled(b->c))
1972 goto insert;
1973
1974 /* prev is in the tree, if we merge we're done */
1975 status = BTREE_INSERT_STATUS_BACK_MERGE;
1976 if (prev &&
1977 bch_bkey_try_merge(b, prev, k))
1978 goto merged;
1979
1980 status = BTREE_INSERT_STATUS_OVERWROTE;
1981 if (m != end(i) &&
1982 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1983 goto copy;
1984
1985 status = BTREE_INSERT_STATUS_FRONT_MERGE;
1986 if (m != end(i) &&
1987 bch_bkey_try_merge(b, k, m))
1988 goto copy;
1989 } else {
1990 BUG_ON(replace_key);
1991 m = bch_bset_search(b, &b->sets[b->nsets], k);
1992 }
1993
1994insert: shift_keys(b, m, k);
1995copy: bkey_copy(m, k);
1996merged:
1997 bch_check_keys(b, "%u for %s", status,
1998 replace_key ? "replace" : "insert");
1999
2000 if (b->level && !KEY_OFFSET(k))
2001 btree_current_write(b)->prio_blocked++;
2002
2003 trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
2004 1815
2005 return true; 1816 return max(ret, 0L);
2006} 1817}
2007 1818
2008static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, 1819static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
@@ -2010,21 +1821,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2010 struct bkey *replace_key) 1821 struct bkey *replace_key)
2011{ 1822{
2012 bool ret = false; 1823 bool ret = false;
2013 int oldsize = bch_count_data(b); 1824 int oldsize = bch_count_data(&b->keys);
2014 1825
2015 while (!bch_keylist_empty(insert_keys)) { 1826 while (!bch_keylist_empty(insert_keys)) {
2016 struct bset *i = write_block(b);
2017 struct bkey *k = insert_keys->keys; 1827 struct bkey *k = insert_keys->keys;
2018 1828
2019 if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) 1829 if (bkey_u64s(k) > insert_u64s_remaining(b))
2020 > btree_blocks(b))
2021 break; 1830 break;
2022 1831
2023 if (bkey_cmp(k, &b->key) <= 0) { 1832 if (bkey_cmp(k, &b->key) <= 0) {
2024 if (!b->level) 1833 if (!b->level)
2025 bkey_put(b->c, k); 1834 bkey_put(b->c, k);
2026 1835
2027 ret |= btree_insert_key(b, op, k, replace_key); 1836 ret |= btree_insert_key(b, k, replace_key);
2028 bch_keylist_pop_front(insert_keys); 1837 bch_keylist_pop_front(insert_keys);
2029 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { 1838 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2030 BKEY_PADDED(key) temp; 1839 BKEY_PADDED(key) temp;
@@ -2033,16 +1842,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2033 bch_cut_back(&b->key, &temp.key); 1842 bch_cut_back(&b->key, &temp.key);
2034 bch_cut_front(&b->key, insert_keys->keys); 1843 bch_cut_front(&b->key, insert_keys->keys);
2035 1844
2036 ret |= btree_insert_key(b, op, &temp.key, replace_key); 1845 ret |= btree_insert_key(b, &temp.key, replace_key);
2037 break; 1846 break;
2038 } else { 1847 } else {
2039 break; 1848 break;
2040 } 1849 }
2041 } 1850 }
2042 1851
1852 if (!ret)
1853 op->insert_collision = true;
1854
2043 BUG_ON(!bch_keylist_empty(insert_keys) && b->level); 1855 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2044 1856
2045 BUG_ON(bch_count_data(b) < oldsize); 1857 BUG_ON(bch_count_data(&b->keys) < oldsize);
2046 return ret; 1858 return ret;
2047} 1859}
2048 1860
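With this hunk, btree_insert_key() defers the per-key work to bch_btree_insert_key() in the bset code, and bch_btree_insert_keys() only decides whether a key still fits using insert_u64s_remaining(); for extent nodes that check keeps back room for one worst-case key, because an insert can land in the middle of an existing extent and split it in two. A sketch of just that headroom rule follows; the value 8 is an assumed placeholder for the worst-case key size, not the kernel's KEY_MAX_U64S.

#include <stdbool.h>
#include <stdio.h>

#define WORST_CASE_KEY_U64S 8   /* assumed placeholder, not the kernel's KEY_MAX_U64S */

static long insert_u64s_remaining(long free_u64s, bool is_extents)
{
        long ret = free_u64s;

        /* an insert into an extent node may split an existing extent in two */
        if (is_extents)
                ret -= WORST_CASE_KEY_U64S;

        return ret > 0 ? ret : 0;
}

int main(void)
{
        printf("%ld\n", insert_u64s_remaining(20, true));       /* 12 */
        printf("%ld\n", insert_u64s_remaining(5, true));        /* 0: forces a split */
        return 0;
}

In the patch, bch_btree_insert_keys() breaks out of its loop as soon as bkey_u64s(k) exceeds this value, and bch_btree_insert_node() uses the same function to decide when to take the split path.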
@@ -2059,16 +1871,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
2059 closure_init_stack(&cl); 1871 closure_init_stack(&cl);
2060 bch_keylist_init(&parent_keys); 1872 bch_keylist_init(&parent_keys);
2061 1873
1874 if (!b->level &&
1875 btree_check_reserve(b, op))
1876 return -EINTR;
1877
2062 n1 = btree_node_alloc_replacement(b, true); 1878 n1 = btree_node_alloc_replacement(b, true);
2063 if (IS_ERR(n1)) 1879 if (IS_ERR(n1))
2064 goto err; 1880 goto err;
2065 1881
2066 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1882 split = set_blocks(btree_bset_first(n1),
1883 block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
2067 1884
2068 if (split) { 1885 if (split) {
2069 unsigned keys = 0; 1886 unsigned keys = 0;
2070 1887
2071 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 1888 trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
2072 1889
2073 n2 = bch_btree_node_alloc(b->c, b->level, true); 1890 n2 = bch_btree_node_alloc(b->c, b->level, true);
2074 if (IS_ERR(n2)) 1891 if (IS_ERR(n2))
@@ -2087,18 +1904,20 @@ static int btree_split(struct btree *b, struct btree_op *op,
2087 * search tree yet 1904 * search tree yet
2088 */ 1905 */
2089 1906
2090 while (keys < (n1->sets[0].data->keys * 3) / 5) 1907 while (keys < (btree_bset_first(n1)->keys * 3) / 5)
2091 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1908 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
1909 keys));
2092 1910
2093 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); 1911 bkey_copy_key(&n1->key,
2094 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1912 bset_bkey_idx(btree_bset_first(n1), keys));
1913 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
2095 1914
2096 n2->sets[0].data->keys = n1->sets[0].data->keys - keys; 1915 btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
2097 n1->sets[0].data->keys = keys; 1916 btree_bset_first(n1)->keys = keys;
2098 1917
2099 memcpy(n2->sets[0].data->start, 1918 memcpy(btree_bset_first(n2)->start,
2100 end(n1->sets[0].data), 1919 bset_bkey_last(btree_bset_first(n1)),
2101 n2->sets[0].data->keys * sizeof(uint64_t)); 1920 btree_bset_first(n2)->keys * sizeof(uint64_t));
2102 1921
2103 bkey_copy_key(&n2->key, &b->key); 1922 bkey_copy_key(&n2->key, &b->key);
2104 1923
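The split path above now walks whole keys with bset_bkey_idx() until roughly three fifths of the 64-bit words stay in n1, then copies the remainder into n2; it cannot cut at an arbitrary offset because bkeys are variable length. A small sketch of that walk over an array of per-key sizes (the layout is simplified; in the kernel the sizes come from bkey_u64s()):

#include <stdio.h>

/*
 * Walk whole keys until at least 3/5 of the node's 64-bit words stay in the
 * front node; bkeys are variable length, so the cut can only fall on a key
 * boundary.
 */
static unsigned split_point(const unsigned *key_u64s, unsigned nkeys,
                            unsigned total_u64s)
{
        unsigned used = 0, i = 0;

        while (used < total_u64s * 3 / 5 && i < nkeys)
                used += key_u64s[i++];

        return used;    /* words kept by the front node */
}

int main(void)
{
        unsigned sizes[] = { 3, 3, 3, 3 };      /* four keys, 12 words total */

        printf("front node keeps %u of 12 words\n",
               split_point(sizes, 4, 12));      /* 9: first boundary past 3/5 */
        return 0;
}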
@@ -2106,7 +1925,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
2106 bch_btree_node_write(n2, &cl); 1925 bch_btree_node_write(n2, &cl);
2107 rw_unlock(true, n2); 1926 rw_unlock(true, n2);
2108 } else { 1927 } else {
2109 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 1928 trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
2110 1929
2111 bch_btree_insert_keys(n1, op, insert_keys, replace_key); 1930 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2112 } 1931 }
@@ -2149,18 +1968,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
2149 1968
2150 return 0; 1969 return 0;
2151err_free2: 1970err_free2:
1971 bkey_put(b->c, &n2->key);
2152 btree_node_free(n2); 1972 btree_node_free(n2);
2153 rw_unlock(true, n2); 1973 rw_unlock(true, n2);
2154err_free1: 1974err_free1:
1975 bkey_put(b->c, &n1->key);
2155 btree_node_free(n1); 1976 btree_node_free(n1);
2156 rw_unlock(true, n1); 1977 rw_unlock(true, n1);
2157err: 1978err:
1979 WARN(1, "bcache: btree split failed");
1980
2158 if (n3 == ERR_PTR(-EAGAIN) || 1981 if (n3 == ERR_PTR(-EAGAIN) ||
2159 n2 == ERR_PTR(-EAGAIN) || 1982 n2 == ERR_PTR(-EAGAIN) ||
2160 n1 == ERR_PTR(-EAGAIN)) 1983 n1 == ERR_PTR(-EAGAIN))
2161 return -EAGAIN; 1984 return -EAGAIN;
2162 1985
2163 pr_warn("couldn't split");
2164 return -ENOMEM; 1986 return -ENOMEM;
2165} 1987}
2166 1988
@@ -2171,7 +1993,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2171{ 1993{
2172 BUG_ON(b->level && replace_key); 1994 BUG_ON(b->level && replace_key);
2173 1995
2174 if (should_split(b)) { 1996 if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
2175 if (current->bio_list) { 1997 if (current->bio_list) {
2176 op->lock = b->c->root->level + 1; 1998 op->lock = b->c->root->level + 1;
2177 return -EAGAIN; 1999 return -EAGAIN;
@@ -2180,11 +2002,13 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2180 return -EINTR; 2002 return -EINTR;
2181 } else { 2003 } else {
2182 /* Invalidated all iterators */ 2004 /* Invalidated all iterators */
2183 return btree_split(b, op, insert_keys, replace_key) ?: 2005 int ret = btree_split(b, op, insert_keys, replace_key);
2184 -EINTR; 2006
2007 return bch_keylist_empty(insert_keys) ?
2008 0 : ret ?: -EINTR;
2185 } 2009 }
2186 } else { 2010 } else {
2187 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2011 BUG_ON(write_block(b) != btree_bset_last(b));
2188 2012
2189 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 2013 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2190 if (!b->level) 2014 if (!b->level)
@@ -2323,9 +2147,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2323 struct bkey *k; 2147 struct bkey *k;
2324 struct btree_iter iter; 2148 struct btree_iter iter;
2325 2149
2326 bch_btree_iter_init(b, &iter, from); 2150 bch_btree_iter_init(&b->keys, &iter, from);
2327 2151
2328 while ((k = bch_btree_iter_next_filter(&iter, b, 2152 while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
2329 bch_ptr_bad))) { 2153 bch_ptr_bad))) {
2330 ret = btree(map_nodes_recurse, k, b, 2154 ret = btree(map_nodes_recurse, k, b,
2331 op, from, fn, flags); 2155 op, from, fn, flags);
@@ -2356,9 +2180,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2356 struct bkey *k; 2180 struct bkey *k;
2357 struct btree_iter iter; 2181 struct btree_iter iter;
2358 2182
2359 bch_btree_iter_init(b, &iter, from); 2183 bch_btree_iter_init(&b->keys, &iter, from);
2360 2184
2361 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { 2185 while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
2362 ret = !b->level 2186 ret = !b->level
2363 ? fn(op, b, k) 2187 ? fn(op, b, k)
2364 : btree(map_keys_recurse, k, b, op, from, fn, flags); 2188 : btree(map_keys_recurse, k, b, op, from, fn, flags);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 767e75570896..af065e97e55c 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -130,20 +130,12 @@ struct btree {
130 unsigned long flags; 130 unsigned long flags;
131 uint16_t written; /* would be nice to kill */ 131 uint16_t written; /* would be nice to kill */
132 uint8_t level; 132 uint8_t level;
133 uint8_t nsets; 133
134 uint8_t page_order; 134 struct btree_keys keys;
135
136 /*
137 * Set of sorted keys - the real btree node - plus a binary search tree
138 *
139 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
140 * to the memory we have allocated for this btree node. Additionally,
141 * set[0]->data points to the entire btree node as it exists on disk.
142 */
143 struct bset_tree sets[MAX_BSETS];
144 135
145 /* For outstanding btree writes, used as a lock - protects write_idx */ 136 /* For outstanding btree writes, used as a lock - protects write_idx */
146 struct closure_with_waitlist io; 137 struct closure io;
138 struct semaphore io_mutex;
147 139
148 struct list_head list; 140 struct list_head list;
149 struct delayed_work work; 141 struct delayed_work work;
@@ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b)
179 return b->writes + (btree_node_write_idx(b) ^ 1); 171 return b->writes + (btree_node_write_idx(b) ^ 1);
180} 172}
181 173
182static inline unsigned bset_offset(struct btree *b, struct bset *i) 174static inline struct bset *btree_bset_first(struct btree *b)
183{ 175{
184 return (((size_t) i) - ((size_t) b->sets->data)) >> 9; 176 return b->keys.set->data;
185} 177}
186 178
187static inline struct bset *write_block(struct btree *b) 179static inline struct bset *btree_bset_last(struct btree *b)
188{ 180{
189 return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); 181 return bset_tree_last(&b->keys)->data;
190} 182}
191 183
192static inline bool bset_written(struct btree *b, struct bset_tree *t) 184static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
193{ 185{
194 return t->data < write_block(b); 186 return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
195}
196
197static inline bool bkey_written(struct btree *b, struct bkey *k)
198{
199 return k < write_block(b)->start;
200} 187}
201 188
202static inline void set_gc_sectors(struct cache_set *c) 189static inline void set_gc_sectors(struct cache_set *c)
@@ -204,21 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c)
204 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); 191 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
205} 192}
206 193
207static inline struct bkey *bch_btree_iter_init(struct btree *b,
208 struct btree_iter *iter,
209 struct bkey *search)
210{
211 return __bch_btree_iter_init(b, iter, search, b->sets);
212}
213
214static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
215{
216 if (b->level)
217 return bch_btree_ptr_invalid(b->c, k);
218 else
219 return bch_extent_ptr_invalid(b->c, k);
220}
221
222void bkey_put(struct cache_set *c, struct bkey *k); 194void bkey_put(struct cache_set *c, struct bkey *k);
223 195
224/* Looping macros */ 196/* Looping macros */
@@ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k);
229 iter++) \ 201 iter++) \
230 hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) 202 hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
231 203
232#define for_each_key_filter(b, k, iter, filter) \
233 for (bch_btree_iter_init((b), (iter), NULL); \
234 ((k) = bch_btree_iter_next_filter((iter), b, filter));)
235
236#define for_each_key(b, k, iter) \
237 for (bch_btree_iter_init((b), (iter), NULL); \
238 ((k) = bch_btree_iter_next(iter));)
239
240/* Recursing down the btree */ 204/* Recursing down the btree */
241 205
242struct btree_op { 206struct btree_op {
207 /* for waiting on btree reserve in btree_split() */
208 wait_queue_t wait;
209
243 /* Btree level at which we start taking write locks */ 210 /* Btree level at which we start taking write locks */
244 short lock; 211 short lock;
245 212
@@ -249,6 +216,7 @@ struct btree_op {
249static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) 216static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
250{ 217{
251 memset(op, 0, sizeof(struct btree_op)); 218 memset(op, 0, sizeof(struct btree_op));
219 init_wait(&op->wait);
252 op->lock = write_lock_level; 220 op->lock = write_lock_level;
253} 221}
254 222
@@ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b)
267 (w ? up_write : up_read)(&b->lock); 235 (w ? up_write : up_read)(&b->lock);
268} 236}
269 237
270void bch_btree_node_read(struct btree *); 238void bch_btree_node_read_done(struct btree *);
271void bch_btree_node_write(struct btree *, struct closure *); 239void bch_btree_node_write(struct btree *, struct closure *);
272 240
273void bch_btree_set_root(struct btree *); 241void bch_btree_set_root(struct btree *);
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index dfff2410322e..7a228de95fd7 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -11,19 +11,6 @@
11 11
12#include "closure.h" 12#include "closure.h"
13 13
14#define CL_FIELD(type, field) \
15 case TYPE_ ## type: \
16 return &container_of(cl, struct type, cl)->field
17
18static struct closure_waitlist *closure_waitlist(struct closure *cl)
19{
20 switch (cl->type) {
21 CL_FIELD(closure_with_waitlist, wait);
22 default:
23 return NULL;
24 }
25}
26
27static inline void closure_put_after_sub(struct closure *cl, int flags) 14static inline void closure_put_after_sub(struct closure *cl, int flags)
28{ 15{
29 int r = flags & CLOSURE_REMAINING_MASK; 16 int r = flags & CLOSURE_REMAINING_MASK;
@@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
42 closure_queue(cl); 29 closure_queue(cl);
43 } else { 30 } else {
44 struct closure *parent = cl->parent; 31 struct closure *parent = cl->parent;
45 struct closure_waitlist *wait = closure_waitlist(cl);
46 closure_fn *destructor = cl->fn; 32 closure_fn *destructor = cl->fn;
47 33
48 closure_debug_destroy(cl); 34 closure_debug_destroy(cl);
49 35
50 smp_mb();
51 atomic_set(&cl->remaining, -1);
52
53 if (wait)
54 closure_wake_up(wait);
55
56 if (destructor) 36 if (destructor)
57 destructor(cl); 37 destructor(cl);
58 38
@@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v)
69} 49}
70EXPORT_SYMBOL(closure_sub); 50EXPORT_SYMBOL(closure_sub);
71 51
52/**
53 * closure_put - decrement a closure's refcount
54 */
72void closure_put(struct closure *cl) 55void closure_put(struct closure *cl)
73{ 56{
74 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); 57 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
75} 58}
76EXPORT_SYMBOL(closure_put); 59EXPORT_SYMBOL(closure_put);
77 60
78static void set_waiting(struct closure *cl, unsigned long f) 61/**
79{ 62 * closure_wake_up - wake up all closures on a wait list, without memory barrier
80#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 63 */
81 cl->waiting_on = f;
82#endif
83}
84
85void __closure_wake_up(struct closure_waitlist *wait_list) 64void __closure_wake_up(struct closure_waitlist *wait_list)
86{ 65{
87 struct llist_node *list; 66 struct llist_node *list;
@@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
106 cl = container_of(reverse, struct closure, list); 85 cl = container_of(reverse, struct closure, list);
107 reverse = llist_next(reverse); 86 reverse = llist_next(reverse);
108 87
109 set_waiting(cl, 0); 88 closure_set_waiting(cl, 0);
110 closure_sub(cl, CLOSURE_WAITING + 1); 89 closure_sub(cl, CLOSURE_WAITING + 1);
111 } 90 }
112} 91}
113EXPORT_SYMBOL(__closure_wake_up); 92EXPORT_SYMBOL(__closure_wake_up);
114 93
115bool closure_wait(struct closure_waitlist *list, struct closure *cl) 94/**
95 * closure_wait - add a closure to a waitlist
96 *
97 * @waitlist will own a ref on @cl, which will be released when
98 * closure_wake_up() is called on @waitlist.
99 *
100 */
101bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
116{ 102{
117 if (atomic_read(&cl->remaining) & CLOSURE_WAITING) 103 if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
118 return false; 104 return false;
119 105
120 set_waiting(cl, _RET_IP_); 106 closure_set_waiting(cl, _RET_IP_);
121 atomic_add(CLOSURE_WAITING + 1, &cl->remaining); 107 atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
122 llist_add(&cl->list, &list->list); 108 llist_add(&cl->list, &waitlist->list);
123 109
124 return true; 110 return true;
125} 111}
126EXPORT_SYMBOL(closure_wait); 112EXPORT_SYMBOL(closure_wait);
127 113
128/** 114/**
 129 * closure_sync() - sleep until a closure has nothing left to wait on 115 * closure_sync - sleep until a closure has nothing left to wait on
130 * 116 *
131 * Sleeps until the refcount hits 1 - the thread that's running the closure owns 117 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
132 * the last refcount. 118 * the last refcount.
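The kernel-doc added above spells out the closure_wait()/closure_wake_up() contract: putting a closure on a waitlist takes one reference and sets CLOSURE_WAITING, waking drops both, and a closure can sit on at most one waitlist at a time. A userspace model of that refcount-plus-flag bookkeeping (mini_closure and friends are illustrative names, not the kernel types, and the real code does this atomically):

#include <stdbool.h>
#include <stdio.h>

#define WAITING (1 << 16)       /* models CLOSURE_WAITING above the refcount bits */

struct mini_closure {
        int remaining;          /* low bits: refcount, high bit: WAITING flag */
};

/* Adding to a waitlist takes one ref and sets the flag; at most one list. */
static bool mini_wait(struct mini_closure *cl)
{
        if (cl->remaining & WAITING)
                return false;

        cl->remaining += WAITING + 1;
        return true;
}

/* Waking clears the flag and drops the ref the waitlist was holding. */
static void mini_wake(struct mini_closure *cl)
{
        cl->remaining -= WAITING + 1;
}

int main(void)
{
        struct mini_closure cl = { .remaining = 1 };

        printf("%d %d\n", mini_wait(&cl), mini_wait(&cl));      /* 1 0 */
        mini_wake(&cl);
        printf("remaining=%d\n", cl.remaining);                 /* back to 1 */
        return 0;
}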
@@ -148,46 +134,6 @@ void closure_sync(struct closure *cl)
148} 134}
149EXPORT_SYMBOL(closure_sync); 135EXPORT_SYMBOL(closure_sync);
150 136
151/**
152 * closure_trylock() - try to acquire the closure, without waiting
153 * @cl: closure to lock
154 *
 155 * Returns true if the closure was successfully locked.
156 */
157bool closure_trylock(struct closure *cl, struct closure *parent)
158{
159 if (atomic_cmpxchg(&cl->remaining, -1,
160 CLOSURE_REMAINING_INITIALIZER) != -1)
161 return false;
162
163 smp_mb();
164
165 cl->parent = parent;
166 if (parent)
167 closure_get(parent);
168
169 closure_set_ret_ip(cl);
170 closure_debug_create(cl);
171 return true;
172}
173EXPORT_SYMBOL(closure_trylock);
174
175void __closure_lock(struct closure *cl, struct closure *parent,
176 struct closure_waitlist *wait_list)
177{
178 struct closure wait;
179 closure_init_stack(&wait);
180
181 while (1) {
182 if (closure_trylock(cl, parent))
183 return;
184
185 closure_wait_event(wait_list, &wait,
186 atomic_read(&cl->remaining) == -1);
187 }
188}
189EXPORT_SYMBOL(__closure_lock);
190
191#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 137#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
192 138
193static LIST_HEAD(closure_list); 139static LIST_HEAD(closure_list);
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 9762f1be3304..7ef7461912be 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -72,30 +72,6 @@
72 * closure - _always_ use continue_at(). Doing so consistently will help 72 * closure - _always_ use continue_at(). Doing so consistently will help
73 * eliminate an entire class of particularly pernicious races. 73 * eliminate an entire class of particularly pernicious races.
74 * 74 *
75 * For a closure to wait on an arbitrary event, we need to introduce waitlists:
76 *
77 * struct closure_waitlist list;
78 * closure_wait_event(list, cl, condition);
79 * closure_wake_up(wait_list);
80 *
 81 * These work analogously to wait_event() and wake_up() - except that instead of
82 * operating on the current thread (for wait_event()) and lists of threads, they
83 * operate on an explicit closure and lists of closures.
84 *
85 * Because it's a closure we can now wait either synchronously or
86 * asynchronously. closure_wait_event() returns the current value of the
87 * condition, and if it returned false continue_at() or closure_sync() can be
88 * used to wait for it to become true.
89 *
90 * It's useful for waiting on things when you can't sleep in the context in
91 * which you must check the condition (perhaps a spinlock held, or you might be
92 * beneath generic_make_request() - in which case you can't sleep on IO).
93 *
94 * closure_wait_event() will wait either synchronously or asynchronously,
95 * depending on whether the closure is in blocking mode or not. You can pick a
96 * mode explicitly with closure_wait_event_sync() and
97 * closure_wait_event_async(), which do just what you might expect.
98 *
99 * Lastly, you might have a wait list dedicated to a specific event, and have no 75 * Lastly, you might have a wait list dedicated to a specific event, and have no
100 * need for specifying the condition - you just want to wait until someone runs 76 * need for specifying the condition - you just want to wait until someone runs
101 * closure_wake_up() on the appropriate wait list. In that case, just use 77 * closure_wake_up() on the appropriate wait list. In that case, just use
@@ -121,40 +97,6 @@
121 * All this implies that a closure should typically be embedded in a particular 97 * All this implies that a closure should typically be embedded in a particular
122 * struct (which its refcount will normally control the lifetime of), and that 98 * struct (which its refcount will normally control the lifetime of), and that
123 * struct can very much be thought of as a stack frame. 99 * struct can very much be thought of as a stack frame.
124 *
125 * Locking:
126 *
127 * Closures are based on work items but they can be thought of as more like
128 * threads - in that like threads and unlike work items they have a well
129 * defined lifetime; they are created (with closure_init()) and eventually
130 * complete after a continue_at(cl, NULL, NULL).
131 *
132 * Suppose you've got some larger structure with a closure embedded in it that's
133 * used for periodically doing garbage collection. You only want one garbage
134 * collection happening at a time, so the natural thing to do is protect it with
135 * a lock. However, it's difficult to use a lock protecting a closure correctly
136 * because the unlock should come after the last continue_to() (additionally, if
137 * you're using the closure asynchronously a mutex won't work since a mutex has
138 * to be unlocked by the same process that locked it).
139 *
140 * So to make it less error prone and more efficient, we also have the ability
141 * to use closures as locks:
142 *
143 * closure_init_unlocked();
144 * closure_trylock();
145 *
146 * That's all we need for trylock() - the last closure_put() implicitly unlocks
147 * it for you. But for closure_lock(), we also need a wait list:
148 *
149 * struct closure_with_waitlist frobnicator_cl;
150 *
151 * closure_init_unlocked(&frobnicator_cl);
152 * closure_lock(&frobnicator_cl);
153 *
154 * A closure_with_waitlist embeds a closure and a wait list - much like struct
155 * delayed_work embeds a work item and a timer_list. The important thing is, use
156 * it exactly like you would a regular closure and closure_put() will magically
157 * handle everything for you.
158 */ 100 */
159 101
160struct closure; 102struct closure;
@@ -164,12 +106,6 @@ struct closure_waitlist {
164 struct llist_head list; 106 struct llist_head list;
165}; 107};
166 108
167enum closure_type {
168 TYPE_closure = 0,
169 TYPE_closure_with_waitlist = 1,
170 MAX_CLOSURE_TYPE = 1,
171};
172
173enum closure_state { 109enum closure_state {
174 /* 110 /*
175 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by 111 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
@@ -224,8 +160,6 @@ struct closure {
224 160
225 atomic_t remaining; 161 atomic_t remaining;
226 162
227 enum closure_type type;
228
229#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 163#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
230#define CLOSURE_MAGIC_DEAD 0xc054dead 164#define CLOSURE_MAGIC_DEAD 0xc054dead
231#define CLOSURE_MAGIC_ALIVE 0xc054a11e 165#define CLOSURE_MAGIC_ALIVE 0xc054a11e
@@ -237,34 +171,12 @@ struct closure {
237#endif 171#endif
238}; 172};
239 173
240struct closure_with_waitlist {
241 struct closure cl;
242 struct closure_waitlist wait;
243};
244
245extern unsigned invalid_closure_type(void);
246
247#define __CLOSURE_TYPE(cl, _t) \
248 __builtin_types_compatible_p(typeof(cl), struct _t) \
249 ? TYPE_ ## _t : \
250
251#define __closure_type(cl) \
252( \
253 __CLOSURE_TYPE(cl, closure) \
254 __CLOSURE_TYPE(cl, closure_with_waitlist) \
255 invalid_closure_type() \
256)
257
258void closure_sub(struct closure *cl, int v); 174void closure_sub(struct closure *cl, int v);
259void closure_put(struct closure *cl); 175void closure_put(struct closure *cl);
260void __closure_wake_up(struct closure_waitlist *list); 176void __closure_wake_up(struct closure_waitlist *list);
261bool closure_wait(struct closure_waitlist *list, struct closure *cl); 177bool closure_wait(struct closure_waitlist *list, struct closure *cl);
262void closure_sync(struct closure *cl); 178void closure_sync(struct closure *cl);
263 179
264bool closure_trylock(struct closure *cl, struct closure *parent);
265void __closure_lock(struct closure *cl, struct closure *parent,
266 struct closure_waitlist *wait_list);
267
268#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 180#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
269 181
270void closure_debug_init(void); 182void closure_debug_init(void);
@@ -293,134 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl)
293#endif 205#endif
294} 206}
295 207
296static inline void closure_get(struct closure *cl) 208static inline void closure_set_waiting(struct closure *cl, unsigned long f)
297{ 209{
298#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 210#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
299 BUG_ON((atomic_inc_return(&cl->remaining) & 211 cl->waiting_on = f;
300 CLOSURE_REMAINING_MASK) <= 1);
301#else
302 atomic_inc(&cl->remaining);
303#endif 212#endif
304} 213}
305 214
306static inline void closure_set_stopped(struct closure *cl) 215static inline void __closure_end_sleep(struct closure *cl)
307{ 216{
308 atomic_sub(CLOSURE_RUNNING, &cl->remaining); 217 __set_current_state(TASK_RUNNING);
218
219 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
220 atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
309} 221}
310 222
311static inline bool closure_is_unlocked(struct closure *cl) 223static inline void __closure_start_sleep(struct closure *cl)
312{ 224{
313 return atomic_read(&cl->remaining) == -1; 225 closure_set_ip(cl);
226 cl->task = current;
227 set_current_state(TASK_UNINTERRUPTIBLE);
228
229 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
230 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
314} 231}
315 232
316static inline void do_closure_init(struct closure *cl, struct closure *parent, 233static inline void closure_set_stopped(struct closure *cl)
317 bool running)
318{ 234{
319 cl->parent = parent; 235 atomic_sub(CLOSURE_RUNNING, &cl->remaining);
320 if (parent) 236}
321 closure_get(parent);
322
323 if (running) {
324 closure_debug_create(cl);
325 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
326 } else
327 atomic_set(&cl->remaining, -1);
328 237
238static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
239 struct workqueue_struct *wq)
240{
241 BUG_ON(object_is_on_stack(cl));
329 closure_set_ip(cl); 242 closure_set_ip(cl);
243 cl->fn = fn;
244 cl->wq = wq;
245 /* between atomic_dec() in closure_put() */
246 smp_mb__before_atomic_dec();
330} 247}
331 248
332/* 249static inline void closure_queue(struct closure *cl)
333 * Hack to get at the embedded closure if there is one, by doing an unsafe cast: 250{
334 * the result of __closure_type() is thrown away, it's used merely for type 251 struct workqueue_struct *wq = cl->wq;
335 * checking. 252 if (wq) {
336 */ 253 INIT_WORK(&cl->work, cl->work.func);
337#define __to_internal_closure(cl) \ 254 BUG_ON(!queue_work(wq, &cl->work));
338({ \ 255 } else
339 BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ 256 cl->fn(cl);
340 (struct closure *) cl; \ 257}
341})
342
343#define closure_init_type(cl, parent, running) \
344do { \
345 struct closure *_cl = __to_internal_closure(cl); \
346 _cl->type = __closure_type(*(cl)); \
347 do_closure_init(_cl, parent, running); \
348} while (0)
349 258
350/** 259/**
351 * __closure_init() - Initialize a closure, skipping the memset() 260 * closure_get - increment a closure's refcount
352 *
353 * May be used instead of closure_init() when memory has already been zeroed.
354 */ 261 */
355#define __closure_init(cl, parent) \ 262static inline void closure_get(struct closure *cl)
356 closure_init_type(cl, parent, true) 263{
264#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
265 BUG_ON((atomic_inc_return(&cl->remaining) &
266 CLOSURE_REMAINING_MASK) <= 1);
267#else
268 atomic_inc(&cl->remaining);
269#endif
270}
357 271
358/** 272/**
359 * closure_init() - Initialize a closure, setting the refcount to 1 273 * closure_init - Initialize a closure, setting the refcount to 1
360 * @cl: closure to initialize 274 * @cl: closure to initialize
361 * @parent: parent of the new closure. cl will take a refcount on it for its 275 * @parent: parent of the new closure. cl will take a refcount on it for its
362 * lifetime; may be NULL. 276 * lifetime; may be NULL.
363 */ 277 */
364#define closure_init(cl, parent) \ 278static inline void closure_init(struct closure *cl, struct closure *parent)
365do { \
366 memset((cl), 0, sizeof(*(cl))); \
367 __closure_init(cl, parent); \
368} while (0)
369
370static inline void closure_init_stack(struct closure *cl)
371{ 279{
372 memset(cl, 0, sizeof(struct closure)); 280 memset(cl, 0, sizeof(struct closure));
373 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); 281 cl->parent = parent;
374} 282 if (parent)
375 283 closure_get(parent);
376/**
377 * closure_init_unlocked() - Initialize a closure but leave it unlocked.
378 * @cl: closure to initialize
379 *
380 * For when the closure will be used as a lock. The closure may not be used
381 * until after a closure_lock() or closure_trylock().
382 */
383#define closure_init_unlocked(cl) \
384do { \
385 memset((cl), 0, sizeof(*(cl))); \
386 closure_init_type(cl, NULL, false); \
387} while (0)
388
389/**
390 * closure_lock() - lock and initialize a closure.
391 * @cl: the closure to lock
392 * @parent: the new parent for this closure
393 *
394 * The closure must be of one of the types that has a waitlist (otherwise we
395 * wouldn't be able to sleep on contention).
396 *
397 * @parent has exactly the same meaning as in closure_init(); if non null, the
398 * closure will take a reference on @parent which will be released when it is
399 * unlocked.
400 */
401#define closure_lock(cl, parent) \
402 __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
403 284
404static inline void __closure_end_sleep(struct closure *cl) 285 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
405{
406 __set_current_state(TASK_RUNNING);
407 286
408 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) 287 closure_debug_create(cl);
409 atomic_sub(CLOSURE_SLEEPING, &cl->remaining); 288 closure_set_ip(cl);
410} 289}
411 290
412static inline void __closure_start_sleep(struct closure *cl) 291static inline void closure_init_stack(struct closure *cl)
413{ 292{
414 closure_set_ip(cl); 293 memset(cl, 0, sizeof(struct closure));
415 cl->task = current; 294 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
416 set_current_state(TASK_UNINTERRUPTIBLE);
417
418 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
419 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
420} 295}
421 296
422/** 297/**
423 * closure_wake_up() - wake up all closures on a wait list. 298 * closure_wake_up - wake up all closures on a wait list.
424 */ 299 */
425static inline void closure_wake_up(struct closure_waitlist *list) 300static inline void closure_wake_up(struct closure_waitlist *list)
426{ 301{
@@ -428,69 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list)
428 __closure_wake_up(list); 303 __closure_wake_up(list);
429} 304}
430 305
431/* 306/**
432 * Wait on an event, synchronously or asynchronously - analogous to wait_event() 307 * continue_at - jump to another function with barrier
433 * but for closures. 308 *
434 * 309 * After @cl is no longer waiting on anything (i.e. all outstanding refs have
435 * The loop is oddly structured so as to avoid a race; we must check the 310 * been dropped with closure_put()), it will resume execution at @fn running out
436 * condition again after we've added ourself to the waitlist. We know if we were 311 * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
437 * already on the waitlist because closure_wait() returns false; thus, we only 312 *
438 * schedule or break if closure_wait() returns false. If it returns true, we 313 * NOTE: This macro expands to a return in the calling function!
439 * just loop again - rechecking the condition. 314 *
440 * 315 * This is because after calling continue_at() you no longer have a ref on @cl,
441 * The __closure_wake_up() is necessary because we may race with the event 316 * and whatever @cl owns may be freed out from under you - a running closure fn
442 * becoming true; i.e. we see event false -> wait -> recheck condition, but the 317 * has a ref on its own closure which continue_at() drops.
443 * thread that made the event true may have called closure_wake_up() before we
444 * added ourselves to the wait list.

445 *
446 * We have to call closure_sync() at the end instead of just
447 * __closure_end_sleep() because a different thread might've called
448 * closure_wake_up() before us and gotten preempted before they dropped the
449 * refcount on our closure. If this was a stack allocated closure, that would be
450 * bad.
451 */ 318 */
452#define closure_wait_event(list, cl, condition) \
453({ \
454 typeof(condition) ret; \
455 \
456 while (1) { \
457 ret = (condition); \
458 if (ret) { \
459 __closure_wake_up(list); \
460 closure_sync(cl); \
461 break; \
462 } \
463 \
464 __closure_start_sleep(cl); \
465 \
466 if (!closure_wait(list, cl)) \
467 schedule(); \
468 } \
469 \
470 ret; \
471})
472
473static inline void closure_queue(struct closure *cl)
474{
475 struct workqueue_struct *wq = cl->wq;
476 if (wq) {
477 INIT_WORK(&cl->work, cl->work.func);
478 BUG_ON(!queue_work(wq, &cl->work));
479 } else
480 cl->fn(cl);
481}
482
483static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
484 struct workqueue_struct *wq)
485{
486 BUG_ON(object_is_on_stack(cl));
487 closure_set_ip(cl);
488 cl->fn = fn;
489 cl->wq = wq;
490 /* between atomic_dec() in closure_put() */
491 smp_mb__before_atomic_dec();
492}
493
494#define continue_at(_cl, _fn, _wq) \ 319#define continue_at(_cl, _fn, _wq) \
495do { \ 320do { \
496 set_closure_fn(_cl, _fn, _wq); \ 321 set_closure_fn(_cl, _fn, _wq); \
@@ -498,8 +323,28 @@ do { \
498 return; \ 323 return; \
499} while (0) 324} while (0)
500 325
326/**
327 * closure_return - finish execution of a closure
328 *
329 * This is used to indicate that @cl is finished: when all outstanding refs on
330 * @cl have been dropped @cl's ref on its parent closure (as passed to
331 * closure_init()) will be dropped, if one was specified - thus this can be
332 * thought of as returning to the parent closure.
333 */
501#define closure_return(_cl) continue_at((_cl), NULL, NULL) 334#define closure_return(_cl) continue_at((_cl), NULL, NULL)
502 335
336/**
337 * continue_at_nobarrier - jump to another function without barrier
338 *
339 * Causes @fn to be executed out of @cl, in @wq context (or called directly if
340 * @wq is NULL).
341 *
342 * NOTE: like continue_at(), this macro expands to a return in the caller!
343 *
344 * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
345 * thus it's not safe to touch anything protected by @cl after a
346 * continue_at_nobarrier().
347 */
503#define continue_at_nobarrier(_cl, _fn, _wq) \ 348#define continue_at_nobarrier(_cl, _fn, _wq) \
504do { \ 349do { \
505 set_closure_fn(_cl, _fn, _wq); \ 350 set_closure_fn(_cl, _fn, _wq); \
@@ -507,6 +352,15 @@ do { \
507 return; \ 352 return; \
508} while (0) 353} while (0)
509 354
355/**
 356 * closure_return_with_destructor - finish execution of a closure, with destructor
357 *
358 * Works like closure_return(), except @destructor will be called when all
359 * outstanding refs on @cl have been dropped; @destructor may be used to safely
360 * free the memory occupied by @cl, and it is called with the ref on the parent
361 * closure still held - so @destructor could safely return an item to a
362 * freelist protected by @cl's parent.
363 */
510#define closure_return_with_destructor(_cl, _destructor) \ 364#define closure_return_with_destructor(_cl, _destructor) \
511do { \ 365do { \
512 set_closure_fn(_cl, _destructor, NULL); \ 366 set_closure_fn(_cl, _destructor, NULL); \
@@ -514,6 +368,13 @@ do { \
514 return; \ 368 return; \
515} while (0) 369} while (0)
516 370
371/**
372 * closure_call - execute @fn out of a new, uninitialized closure
373 *
374 * Typically used when running out of one closure, and we want to run @fn
375 * asynchronously out of a new closure - @parent will then wait for @cl to
376 * finish.
377 */
517static inline void closure_call(struct closure *cl, closure_fn fn, 378static inline void closure_call(struct closure *cl, closure_fn fn,
518 struct workqueue_struct *wq, 379 struct workqueue_struct *wq,
519 struct closure *parent) 380 struct closure *parent)
@@ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn,
522 continue_at_nobarrier(cl, fn, wq); 383 continue_at_nobarrier(cl, fn, wq);
523} 384}
524 385
525static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
526 struct workqueue_struct *wq,
527 struct closure *parent)
528{
529 if (closure_trylock(cl, parent))
530 continue_at_nobarrier(cl, fn, wq);
531}
532
533#endif /* _LINUX_CLOSURE_H */ 386#endif /* _LINUX_CLOSURE_H */
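Taken together, the comments added to closure.h describe a small state machine: closure_init() starts the refcount at 1, closure_get()/closure_put() adjust it, and continue_at() stores the next function plus workqueue and gives up the caller's ref, so the stored function runs once the last ref is dropped. A compact userspace model of that control flow, run synchronously with no workqueue; mini_closure, mini_put and mini_continue_at are stand-ins, not the kernel API.

#include <stdio.h>

struct mini_closure;
typedef void (*mini_fn)(struct mini_closure *);

struct mini_closure {
        int remaining;
        mini_fn fn;
};

static void mini_put(struct mini_closure *cl)
{
        if (--cl->remaining == 0 && cl->fn)
                cl->fn(cl);             /* last ref gone: resume at the stored fn */
}

static void mini_continue_at(struct mini_closure *cl, mini_fn fn)
{
        cl->fn = fn;
        mini_put(cl);                   /* the caller gives up its ref here */
}

static void stage2(struct mini_closure *cl)
{
        (void) cl;
        printf("stage2 runs once the last ref is dropped\n");
}

static void stage1(struct mini_closure *cl)
{
        printf("stage1 runs\n");
        mini_continue_at(cl, stage2);   /* the real macro also returns here */
}

int main(void)
{
        /* one ref for the running stage1, one for an outstanding "I/O" */
        struct mini_closure cl = { .remaining = 2, .fn = NULL };

        stage1(&cl);                    /* stage2 not yet run: the I/O ref is held */
        mini_put(&cl);                  /* "I/O completion" -> stage2 */
        return 0;
}

The NOTE in the continue_at() comment is the key point this models: in the kernel the macro really does return, because after it the running function no longer owns a reference on the closure.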
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 264fcfbd6290..8b1f1d5c1819 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -8,6 +8,7 @@
8#include "bcache.h" 8#include "bcache.h"
9#include "btree.h" 9#include "btree.h"
10#include "debug.h" 10#include "debug.h"
11#include "extents.h"
11 12
12#include <linux/console.h> 13#include <linux/console.h>
13#include <linux/debugfs.h> 14#include <linux/debugfs.h>
@@ -17,163 +18,96 @@
17 18
18static struct dentry *debug; 19static struct dentry *debug;
19 20
20const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
21{
22 unsigned i;
23
24 for (i = 0; i < KEY_PTRS(k); i++)
25 if (ptr_available(c, k, i)) {
26 struct cache *ca = PTR_CACHE(c, k, i);
27 size_t bucket = PTR_BUCKET_NR(c, k, i);
28 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
29
30 if (KEY_SIZE(k) + r > c->sb.bucket_size)
31 return "bad, length too big";
32 if (bucket < ca->sb.first_bucket)
33 return "bad, short offset";
34 if (bucket >= ca->sb.nbuckets)
35 return "bad, offset past end of device";
36 if (ptr_stale(c, k, i))
37 return "stale";
38 }
39
40 if (!bkey_cmp(k, &ZERO_KEY))
41 return "bad, null key";
42 if (!KEY_PTRS(k))
43 return "bad, no pointers";
44 if (!KEY_SIZE(k))
45 return "zeroed key";
46 return "";
47}
48
49int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
50{
51 unsigned i = 0;
52 char *out = buf, *end = buf + size;
53
54#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
55
56 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
57
58 if (KEY_PTRS(k))
59 while (1) {
60 p("%llu:%llu gen %llu",
61 PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
62
63 if (++i == KEY_PTRS(k))
64 break;
65
66 p(", ");
67 }
68
69 p("]");
70
71 if (KEY_DIRTY(k))
72 p(" dirty");
73 if (KEY_CSUM(k))
74 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
75#undef p
76 return out - buf;
77}
78
79#ifdef CONFIG_BCACHE_DEBUG 21#ifdef CONFIG_BCACHE_DEBUG
80 22
81static void dump_bset(struct btree *b, struct bset *i) 23#define for_each_written_bset(b, start, i) \
82{ 24 for (i = (start); \
83 struct bkey *k, *next; 25 (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
84 unsigned j; 26 i->seq == (start)->seq; \
85 char buf[80]; 27 i = (void *) i + set_blocks(i, block_bytes(b->c)) * \
86 28 block_bytes(b->c))
87 for (k = i->start; k < end(i); k = next) {
88 next = bkey_next(k);
89
90 bch_bkey_to_text(buf, sizeof(buf), k);
91 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
92 (uint64_t *) k - i->d, i->keys, buf);
93
94 for (j = 0; j < KEY_PTRS(k); j++) {
95 size_t n = PTR_BUCKET_NR(b->c, k, j);
96 printk(" bucket %zu", n);
97
98 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
99 printk(" prio %i",
100 PTR_BUCKET(b->c, k, j)->prio);
101 }
102 29
103 printk(" %s\n", bch_ptr_status(b->c, k)); 30void bch_btree_verify(struct btree *b)
104
105 if (next < end(i) &&
106 bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
107 printk(KERN_ERR "Key skipped backwards\n");
108 }
109}
110
111static void bch_dump_bucket(struct btree *b)
112{
113 unsigned i;
114
115 console_lock();
116 for (i = 0; i <= b->nsets; i++)
117 dump_bset(b, b->sets[i].data);
118 console_unlock();
119}
120
121void bch_btree_verify(struct btree *b, struct bset *new)
122{ 31{
123 struct btree *v = b->c->verify_data; 32 struct btree *v = b->c->verify_data;
124 struct closure cl; 33 struct bset *ondisk, *sorted, *inmemory;
125 closure_init_stack(&cl); 34 struct bio *bio;
126 35
127 if (!b->c->verify) 36 if (!b->c->verify || !b->c->verify_ondisk)
128 return; 37 return;
129 38
130 closure_wait_event(&b->io.wait, &cl, 39 down(&b->io_mutex);
131 atomic_read(&b->io.cl.remaining) == -1);
132
133 mutex_lock(&b->c->verify_lock); 40 mutex_lock(&b->c->verify_lock);
134 41
42 ondisk = b->c->verify_ondisk;
43 sorted = b->c->verify_data->keys.set->data;
44 inmemory = b->keys.set->data;
45
135 bkey_copy(&v->key, &b->key); 46 bkey_copy(&v->key, &b->key);
136 v->written = 0; 47 v->written = 0;
137 v->level = b->level; 48 v->level = b->level;
49 v->keys.ops = b->keys.ops;
50
51 bio = bch_bbio_alloc(b->c);
52 bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev;
53 bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
54 bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9;
55 bch_bio_map(bio, sorted);
138 56
139 bch_btree_node_read(v); 57 submit_bio_wait(REQ_META|READ_SYNC, bio);
140 closure_wait_event(&v->io.wait, &cl, 58 bch_bbio_free(bio, b->c);
141 atomic_read(&b->io.cl.remaining) == -1);
142 59
143 if (new->keys != v->sets[0].data->keys || 60 memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);
144 memcmp(new->start, 61
145 v->sets[0].data->start, 62 bch_btree_node_read_done(v);
146 (void *) end(new) - (void *) new->start)) { 63 sorted = v->keys.set->data;
147 unsigned i, j; 64
65 if (inmemory->keys != sorted->keys ||
66 memcmp(inmemory->start,
67 sorted->start,
68 (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
69 struct bset *i;
70 unsigned j;
148 71
149 console_lock(); 72 console_lock();
150 73
151 printk(KERN_ERR "*** original memory node:\n"); 74 printk(KERN_ERR "*** in memory:\n");
152 for (i = 0; i <= b->nsets; i++) 75 bch_dump_bset(&b->keys, inmemory, 0);
153 dump_bset(b, b->sets[i].data);
154 76
155 printk(KERN_ERR "*** sorted memory node:\n"); 77 printk(KERN_ERR "*** read back in:\n");
156 dump_bset(b, new); 78 bch_dump_bset(&v->keys, sorted, 0);
157 79
158 printk(KERN_ERR "*** on disk node:\n"); 80 for_each_written_bset(b, ondisk, i) {
159 dump_bset(v, v->sets[0].data); 81 unsigned block = ((void *) i - (void *) ondisk) /
82 block_bytes(b->c);
83
84 printk(KERN_ERR "*** on disk block %u:\n", block);
85 bch_dump_bset(&b->keys, i, block);
86 }
160 87
161 for (j = 0; j < new->keys; j++) 88 printk(KERN_ERR "*** block %zu not written\n",
162 if (new->d[j] != v->sets[0].data->d[j]) 89 ((void *) i - (void *) ondisk) / block_bytes(b->c));
90
91 for (j = 0; j < inmemory->keys; j++)
92 if (inmemory->d[j] != sorted->d[j])
163 break; 93 break;
164 94
95 printk(KERN_ERR "b->written %u\n", b->written);
96
165 console_unlock(); 97 console_unlock();
166 panic("verify failed at %u\n", j); 98 panic("verify failed at %u\n", j);
167 } 99 }
168 100
169 mutex_unlock(&b->c->verify_lock); 101 mutex_unlock(&b->c->verify_lock);
102 up(&b->io_mutex);
170} 103}
171 104
172void bch_data_verify(struct cached_dev *dc, struct bio *bio) 105void bch_data_verify(struct cached_dev *dc, struct bio *bio)
173{ 106{
174 char name[BDEVNAME_SIZE]; 107 char name[BDEVNAME_SIZE];
175 struct bio *check; 108 struct bio *check;
176 struct bio_vec *bv; 109 struct bio_vec bv, *bv2;
110 struct bvec_iter iter;
177 int i; 111 int i;
178 112
179 check = bio_clone(bio, GFP_NOIO); 113 check = bio_clone(bio, GFP_NOIO);
@@ -185,95 +119,27 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
185 119
186 submit_bio_wait(READ_SYNC, check); 120 submit_bio_wait(READ_SYNC, check);
187 121
188 bio_for_each_segment(bv, bio, i) { 122 bio_for_each_segment(bv, bio, iter) {
189 void *p1 = kmap_atomic(bv->bv_page); 123 void *p1 = kmap_atomic(bv.bv_page);
190 void *p2 = page_address(check->bi_io_vec[i].bv_page); 124 void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page);
191 125
192 cache_set_err_on(memcmp(p1 + bv->bv_offset, 126 cache_set_err_on(memcmp(p1 + bv.bv_offset,
193 p2 + bv->bv_offset, 127 p2 + bv.bv_offset,
194 bv->bv_len), 128 bv.bv_len),
195 dc->disk.c, 129 dc->disk.c,
196 "verify failed at dev %s sector %llu", 130 "verify failed at dev %s sector %llu",
197 bdevname(dc->bdev, name), 131 bdevname(dc->bdev, name),
198 (uint64_t) bio->bi_sector); 132 (uint64_t) bio->bi_iter.bi_sector);
199 133
200 kunmap_atomic(p1); 134 kunmap_atomic(p1);
201 } 135 }
202 136
203 bio_for_each_segment_all(bv, check, i) 137 bio_for_each_segment_all(bv2, check, i)
204 __free_page(bv->bv_page); 138 __free_page(bv2->bv_page);
205out_put: 139out_put:
206 bio_put(check); 140 bio_put(check);
207} 141}
208 142
209int __bch_count_data(struct btree *b)
210{
211 unsigned ret = 0;
212 struct btree_iter iter;
213 struct bkey *k;
214
215 if (!b->level)
216 for_each_key(b, k, &iter)
217 ret += KEY_SIZE(k);
218 return ret;
219}
220
221void __bch_check_keys(struct btree *b, const char *fmt, ...)
222{
223 va_list args;
224 struct bkey *k, *p = NULL;
225 struct btree_iter iter;
226 const char *err;
227
228 for_each_key(b, k, &iter) {
229 if (!b->level) {
230 err = "Keys out of order";
231 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
232 goto bug;
233
234 if (bch_ptr_invalid(b, k))
235 continue;
236
237 err = "Overlapping keys";
238 if (p && bkey_cmp(p, &START_KEY(k)) > 0)
239 goto bug;
240 } else {
241 if (bch_ptr_bad(b, k))
242 continue;
243
244 err = "Duplicate keys";
245 if (p && !bkey_cmp(p, k))
246 goto bug;
247 }
248 p = k;
249 }
250
251 err = "Key larger than btree node key";
252 if (p && bkey_cmp(p, &b->key) > 0)
253 goto bug;
254
255 return;
256bug:
257 bch_dump_bucket(b);
258
259 va_start(args, fmt);
260 vprintk(fmt, args);
261 va_end(args);
262
263 panic("bcache error: %s:\n", err);
264}
265
266void bch_btree_iter_next_check(struct btree_iter *iter)
267{
268 struct bkey *k = iter->data->k, *next = bkey_next(k);
269
270 if (next < iter->data->end &&
271 bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
272 bch_dump_bucket(iter->b);
273 panic("Key skipped backwards\n");
274 }
275}
276
277#endif 143#endif
278 144
279#ifdef CONFIG_DEBUG_FS 145#ifdef CONFIG_DEBUG_FS
@@ -320,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
320 if (!w) 186 if (!w)
321 break; 187 break;
322 188
323 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); 189 bch_extent_to_text(kbuf, sizeof(kbuf), &w->key);
324 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); 190 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
325 bch_keybuf_del(&i->keys, w); 191 bch_keybuf_del(&i->keys, w);
326 } 192 }
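
Note on the debug.c rework above: bch_btree_verify() now reads the node back from disk with submit_bio_wait(), runs it through bch_btree_node_read_done() to get a sorted copy, and compares the in-memory bset against it word by word, dumping every written bset on a mismatch. A minimal userspace sketch of just the comparison step, assuming the keys are plain arrays of 64-bit words (the helper name first_mismatch is made up, not from the patch):

/* Toy model of the verify comparison: find the first 64-bit word where the
 * in-memory keys diverge from the keys read back from disk. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stddef.h>

static size_t first_mismatch(const uint64_t *inmemory, const uint64_t *ondisk,
			     size_t nkeys)
{
	size_t j;

	for (j = 0; j < nkeys; j++)
		if (inmemory[j] != ondisk[j])
			break;
	return j;			/* == nkeys when the sets match */
}

int main(void)
{
	uint64_t mem[]  = { 1, 2, 3, 4 };
	uint64_t disk[] = { 1, 2, 9, 4 };
	size_t n = sizeof(mem) / sizeof(mem[0]);

	if (memcmp(mem, disk, sizeof(mem)))
		printf("verify failed at %zu\n", first_mismatch(mem, disk, n));
	else
		printf("in-memory and on-disk keys match\n");
	return 0;
}
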
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 2ede60e31874..1f63c195d247 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -1,47 +1,30 @@
1#ifndef _BCACHE_DEBUG_H 1#ifndef _BCACHE_DEBUG_H
2#define _BCACHE_DEBUG_H 2#define _BCACHE_DEBUG_H
3 3
4/* Btree/bkey debug printing */ 4struct bio;
5 5struct cached_dev;
6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); 6struct cache_set;
7 7
8#ifdef CONFIG_BCACHE_DEBUG 8#ifdef CONFIG_BCACHE_DEBUG
9 9
10void bch_btree_verify(struct btree *, struct bset *); 10void bch_btree_verify(struct btree *);
11void bch_data_verify(struct cached_dev *, struct bio *); 11void bch_data_verify(struct cached_dev *, struct bio *);
12int __bch_count_data(struct btree *);
13void __bch_check_keys(struct btree *, const char *, ...);
14void bch_btree_iter_next_check(struct btree_iter *);
15 12
16#define EBUG_ON(cond) BUG_ON(cond)
17#define expensive_debug_checks(c) ((c)->expensive_debug_checks) 13#define expensive_debug_checks(c) ((c)->expensive_debug_checks)
18#define key_merging_disabled(c) ((c)->key_merging_disabled) 14#define key_merging_disabled(c) ((c)->key_merging_disabled)
19#define bypass_torture_test(d) ((d)->bypass_torture_test) 15#define bypass_torture_test(d) ((d)->bypass_torture_test)
20 16
21#else /* DEBUG */ 17#else /* DEBUG */
22 18
23static inline void bch_btree_verify(struct btree *b, struct bset *i) {} 19static inline void bch_btree_verify(struct btree *b) {}
24static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} 20static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
25static inline int __bch_count_data(struct btree *b) { return -1; }
26static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
27static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
28 21
29#define EBUG_ON(cond) do { if (cond); } while (0)
30#define expensive_debug_checks(c) 0 22#define expensive_debug_checks(c) 0
31#define key_merging_disabled(c) 0 23#define key_merging_disabled(c) 0
32#define bypass_torture_test(d) 0 24#define bypass_torture_test(d) 0
33 25
34#endif 26#endif
35 27
36#define bch_count_data(b) \
37 (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)
38
39#define bch_check_keys(b, ...) \
40do { \
41 if (expensive_debug_checks((b)->c)) \
42 __bch_check_keys(b, __VA_ARGS__); \
43} while (0)
44
45#ifdef CONFIG_DEBUG_FS 28#ifdef CONFIG_DEBUG_FS
46void bch_debug_init_cache_set(struct cache_set *); 29void bch_debug_init_cache_set(struct cache_set *);
47#else 30#else
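
Note on the debug.h change above: the header now exposes only bch_btree_verify()/bch_data_verify(), with empty static inline stubs when CONFIG_BCACHE_DEBUG is off, so callers never need their own #ifdefs. A minimal sketch of that compile-time stub pattern, with MY_DEBUG standing in for the real config symbol:

/* Sketch of the "#ifdef real function, #else inline no-op" pattern used in
 * debug.h; MY_DEBUG is a stand-in for CONFIG_BCACHE_DEBUG. */
#include <stdio.h>

#ifdef MY_DEBUG
static void my_verify(int x)
{
	printf("expensive check on %d\n", x);
}
#else
static inline void my_verify(int x) {}	/* compiles away when debug is off */
#endif

int main(void)
{
	my_verify(42);		/* call sites stay identical either way */
	return 0;
}
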
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
new file mode 100644
index 000000000000..416d1a3e028e
--- /dev/null
+++ b/drivers/md/bcache/extents.c
@@ -0,0 +1,616 @@
1/*
2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 *
4 * Uses a block device as cache for other block devices; optimized for SSDs.
5 * All allocation is done in buckets, which should match the erase block size
6 * of the device.
7 *
8 * Buckets containing cached data are kept on a heap sorted by priority;
9 * bucket priority is increased on cache hit, and periodically all the buckets
10 * on the heap have their priority scaled down. This currently is just used as
11 * an LRU but in the future should allow for more intelligent heuristics.
12 *
13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14 * counter. Garbage collection is used to remove stale pointers.
15 *
16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
17 * as keys are inserted we only sort the pages that have not yet been written.
18 * When garbage collection is run, we resort the entire node.
19 *
20 * All configuration is done via sysfs; see Documentation/bcache.txt.
21 */
22
23#include "bcache.h"
24#include "btree.h"
25#include "debug.h"
26#include "extents.h"
27#include "writeback.h"
28
29static void sort_key_next(struct btree_iter *iter,
30 struct btree_iter_set *i)
31{
32 i->k = bkey_next(i->k);
33
34 if (i->k == i->end)
35 *i = iter->data[--iter->used];
36}
37
38static bool bch_key_sort_cmp(struct btree_iter_set l,
39 struct btree_iter_set r)
40{
41 int64_t c = bkey_cmp(l.k, r.k);
42
43 return c ? c > 0 : l.k < r.k;
44}
45
46static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
47{
48 unsigned i;
49
50 for (i = 0; i < KEY_PTRS(k); i++)
51 if (ptr_available(c, k, i)) {
52 struct cache *ca = PTR_CACHE(c, k, i);
53 size_t bucket = PTR_BUCKET_NR(c, k, i);
54 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
55
56 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
57 bucket < ca->sb.first_bucket ||
58 bucket >= ca->sb.nbuckets)
59 return true;
60 }
61
62 return false;
63}
64
65/* Common among btree and extent ptrs */
66
67static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
68{
69 unsigned i;
70
71 for (i = 0; i < KEY_PTRS(k); i++)
72 if (ptr_available(c, k, i)) {
73 struct cache *ca = PTR_CACHE(c, k, i);
74 size_t bucket = PTR_BUCKET_NR(c, k, i);
75 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
76
77 if (KEY_SIZE(k) + r > c->sb.bucket_size)
78 return "bad, length too big";
79 if (bucket < ca->sb.first_bucket)
80 return "bad, short offset";
81 if (bucket >= ca->sb.nbuckets)
82 return "bad, offset past end of device";
83 if (ptr_stale(c, k, i))
84 return "stale";
85 }
86
87 if (!bkey_cmp(k, &ZERO_KEY))
88 return "bad, null key";
89 if (!KEY_PTRS(k))
90 return "bad, no pointers";
91 if (!KEY_SIZE(k))
92 return "zeroed key";
93 return "";
94}
95
96void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
97{
98 unsigned i = 0;
99 char *out = buf, *end = buf + size;
100
101#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
102
103 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));
104
105 for (i = 0; i < KEY_PTRS(k); i++) {
106 if (i)
107 p(", ");
108
109 if (PTR_DEV(k, i) == PTR_CHECK_DEV)
110 p("check dev");
111 else
112 p("%llu:%llu gen %llu", PTR_DEV(k, i),
113 PTR_OFFSET(k, i), PTR_GEN(k, i));
114 }
115
116 p("]");
117
118 if (KEY_DIRTY(k))
119 p(" dirty");
120 if (KEY_CSUM(k))
121 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
122#undef p
123}
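
Note: bch_extent_to_text() above builds the key description with a small p() macro that accumulates scnprintf() output and never writes past the end of the buffer. A userspace sketch of the same bounded-append idea, clamping snprintf()'s return value by hand since scnprintf() is kernel-only (the helper name bounded_append is invented for the example):

/* Userspace sketch of the p() accumulator: append formatted text to a fixed
 * buffer without ever overrunning it. */
#include <stdarg.h>
#include <stdio.h>

static char *bounded_append(char *out, char *end, const char *fmt, ...)
{
	va_list args;
	int n;

	if (out >= end)
		return out;
	va_start(args, fmt);
	n = vsnprintf(out, end - out, fmt, args);
	va_end(args);
	if (n < 0)
		return out;
	return (n >= end - out) ? end : out + n;	/* clamp like scnprintf */
}

int main(void)
{
	char buf[64], *out = buf, *end = buf + sizeof(buf);

	out = bounded_append(out, end, "%u:%u len %u -> [", 5, 1024, 8);
	out = bounded_append(out, end, "%u:%u gen %u", 0, 4096, 3);
	out = bounded_append(out, end, "]");
	printf("%s\n", buf);
	return 0;
}
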
124
125static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
126{
127 struct btree *b = container_of(keys, struct btree, keys);
128 unsigned j;
129 char buf[80];
130
131 bch_extent_to_text(buf, sizeof(buf), k);
132 printk(" %s", buf);
133
134 for (j = 0; j < KEY_PTRS(k); j++) {
135 size_t n = PTR_BUCKET_NR(b->c, k, j);
136 printk(" bucket %zu", n);
137
138 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
139 printk(" prio %i",
140 PTR_BUCKET(b->c, k, j)->prio);
141 }
142
143 printk(" %s\n", bch_ptr_status(b->c, k));
144}
145
146/* Btree ptrs */
147
148bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
149{
150 char buf[80];
151
152 if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
153 goto bad;
154
155 if (__ptr_invalid(c, k))
156 goto bad;
157
158 return false;
159bad:
160 bch_extent_to_text(buf, sizeof(buf), k);
161 cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
162 return true;
163}
164
165static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
166{
167 struct btree *b = container_of(bk, struct btree, keys);
168 return __bch_btree_ptr_invalid(b->c, k);
169}
170
171static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
172{
173 unsigned i;
174 char buf[80];
175 struct bucket *g;
176
177 if (mutex_trylock(&b->c->bucket_lock)) {
178 for (i = 0; i < KEY_PTRS(k); i++)
179 if (ptr_available(b->c, k, i)) {
180 g = PTR_BUCKET(b->c, k, i);
181
182 if (KEY_DIRTY(k) ||
183 g->prio != BTREE_PRIO ||
184 (b->c->gc_mark_valid &&
185 GC_MARK(g) != GC_MARK_METADATA))
186 goto err;
187 }
188
189 mutex_unlock(&b->c->bucket_lock);
190 }
191
192 return false;
193err:
194 mutex_unlock(&b->c->bucket_lock);
195 bch_extent_to_text(buf, sizeof(buf), k);
196 btree_bug(b,
197"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
198 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
199 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
200 return true;
201}
202
203static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
204{
205 struct btree *b = container_of(bk, struct btree, keys);
206 unsigned i;
207
208 if (!bkey_cmp(k, &ZERO_KEY) ||
209 !KEY_PTRS(k) ||
210 bch_ptr_invalid(bk, k))
211 return true;
212
213 for (i = 0; i < KEY_PTRS(k); i++)
214 if (!ptr_available(b->c, k, i) ||
215 ptr_stale(b->c, k, i))
216 return true;
217
218 if (expensive_debug_checks(b->c) &&
219 btree_ptr_bad_expensive(b, k))
220 return true;
221
222 return false;
223}
224
225static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
226 struct bkey *insert,
227 struct btree_iter *iter,
228 struct bkey *replace_key)
229{
230 struct btree *b = container_of(bk, struct btree, keys);
231
232 if (!KEY_OFFSET(insert))
233 btree_current_write(b)->prio_blocked++;
234
235 return false;
236}
237
238const struct btree_keys_ops bch_btree_keys_ops = {
239 .sort_cmp = bch_key_sort_cmp,
240 .insert_fixup = bch_btree_ptr_insert_fixup,
241 .key_invalid = bch_btree_ptr_invalid,
242 .key_bad = bch_btree_ptr_bad,
243 .key_to_text = bch_extent_to_text,
244 .key_dump = bch_bkey_dump,
245};
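
Note: bch_btree_keys_ops above (and bch_extent_keys_ops later in this file) are per-key-type operation tables; the generic bset code dispatches through them so btree-pointer keys and extent keys can validate, sort and merge differently. A minimal sketch of that ops-table dispatch, with toy types that only echo the shape of the real structures:

/* Minimal sketch of per-key-type ops tables: generic code calls through
 * function pointers, each key type supplies its own validation rule. */
#include <stdbool.h>
#include <stdio.h>

struct toy_key {
	unsigned size;
	unsigned nr_ptrs;
};

struct key_ops {
	const char *name;
	bool (*key_invalid)(const struct toy_key *);
};

static bool btree_ptr_invalid(const struct toy_key *k)
{
	/* btree pointers must have pointers and a size */
	return !k->nr_ptrs || !k->size;
}

static bool extent_invalid(const struct toy_key *k)
{
	/* zero-size extents are invalid, pointer-less ones are allowed */
	return !k->size;
}

static const struct key_ops btree_keys_ops  = { "btree",  btree_ptr_invalid };
static const struct key_ops extent_keys_ops = { "extent", extent_invalid };

static void check(const struct key_ops *ops, const struct toy_key *k)
{
	printf("%s key: %s\n", ops->name,
	       ops->key_invalid(k) ? "invalid" : "ok");
}

int main(void)
{
	struct toy_key k = { .size = 8, .nr_ptrs = 0 };

	check(&btree_keys_ops, &k);	/* invalid: no pointers */
	check(&extent_keys_ops, &k);	/* ok */
	return 0;
}
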
246
247/* Extents */
248
249/*
250 * Returns true if l > r - unless l == r, in which case returns true if l is
251 * older than r.
252 *
253 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
254 * equal in different sets, we have to process them newest to oldest.
255 */
256static bool bch_extent_sort_cmp(struct btree_iter_set l,
257 struct btree_iter_set r)
258{
259 int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
260
261 return c ? c > 0 : l.k < r.k;
262}
263
264static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
265 struct bkey *tmp)
266{
267 while (iter->used > 1) {
268 struct btree_iter_set *top = iter->data, *i = top + 1;
269
270 if (iter->used > 2 &&
271 bch_extent_sort_cmp(i[0], i[1]))
272 i++;
273
274 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
275 break;
276
277 if (!KEY_SIZE(i->k)) {
278 sort_key_next(iter, i);
279 heap_sift(iter, i - top, bch_extent_sort_cmp);
280 continue;
281 }
282
283 if (top->k > i->k) {
284 if (bkey_cmp(top->k, i->k) >= 0)
285 sort_key_next(iter, i);
286 else
287 bch_cut_front(top->k, i->k);
288
289 heap_sift(iter, i - top, bch_extent_sort_cmp);
290 } else {
291 /* can't happen because of comparison func */
292 BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
293
294 if (bkey_cmp(i->k, top->k) < 0) {
295 bkey_copy(tmp, top->k);
296
297 bch_cut_back(&START_KEY(i->k), tmp);
298 bch_cut_front(i->k, top->k);
299 heap_sift(iter, 0, bch_extent_sort_cmp);
300
301 return tmp;
302 } else {
303 bch_cut_back(&START_KEY(i->k), top->k);
304 }
305 }
306 }
307
308 return NULL;
309}
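
Note: bch_extent_sort_fixup() above resolves overlaps between extents coming from different bsets by trimming with bch_cut_front()/bch_cut_back(), which move an extent's start forward or its end back. A toy sketch of those two cut operations on half-open (start, end) sector ranges; the struct and helpers only mirror the kernel names, they are not the real bkey code:

/* Toy model of bch_cut_front()/bch_cut_back() on half-open sector ranges:
 * cut_front moves the start forward, cut_back moves the end backward, so the
 * overlapping part of an older extent can be trimmed away. */
#include <stdint.h>
#include <stdio.h>

struct extent {
	uint64_t start, end;	/* end is exclusive, like KEY_OFFSET */
};

static void cut_front(uint64_t where, struct extent *e)
{
	if (e->start < where)
		e->start = where > e->end ? e->end : where;
}

static void cut_back(uint64_t where, struct extent *e)
{
	if (e->end > where)
		e->end = where < e->start ? e->start : where;
}

int main(void)
{
	struct extent old = { 0, 100 }, new = { 60, 120 };

	/* keep 'new' intact, trim the overlapping tail off 'old' */
	cut_back(new.start, &old);
	cut_front(old.end, &old);	/* no-op here, shown for symmetry */
	printf("old: [%llu, %llu)  new: [%llu, %llu)\n",
	       (unsigned long long) old.start, (unsigned long long) old.end,
	       (unsigned long long) new.start, (unsigned long long) new.end);
	return 0;
}
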
310
311static bool bch_extent_insert_fixup(struct btree_keys *b,
312 struct bkey *insert,
313 struct btree_iter *iter,
314 struct bkey *replace_key)
315{
316 struct cache_set *c = container_of(b, struct btree, keys)->c;
317
318 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
319 {
320 if (KEY_DIRTY(k))
321 bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
322 offset, -sectors);
323 }
324
325 uint64_t old_offset;
326 unsigned old_size, sectors_found = 0;
327
328 BUG_ON(!KEY_OFFSET(insert));
329 BUG_ON(!KEY_SIZE(insert));
330
331 while (1) {
332 struct bkey *k = bch_btree_iter_next(iter);
333 if (!k)
334 break;
335
336 if (bkey_cmp(&START_KEY(k), insert) >= 0) {
337 if (KEY_SIZE(k))
338 break;
339 else
340 continue;
341 }
342
343 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
344 continue;
345
346 old_offset = KEY_START(k);
347 old_size = KEY_SIZE(k);
348
349 /*
350 * We might overlap with 0 size extents; we can't skip these
351 * because if they're in the set we're inserting to we have to
352 * adjust them so they don't overlap with the key we're
353 * inserting. But we don't want to check them for replace
354 * operations.
355 */
356
357 if (replace_key && KEY_SIZE(k)) {
358 /*
359 * k might have been split since we inserted/found the
360 * key we're replacing
361 */
362 unsigned i;
363 uint64_t offset = KEY_START(k) -
364 KEY_START(replace_key);
365
366 /* But it must be a subset of the replace key */
367 if (KEY_START(k) < KEY_START(replace_key) ||
368 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
369 goto check_failed;
370
371 /* We didn't find a key that we were supposed to */
372 if (KEY_START(k) > KEY_START(insert) + sectors_found)
373 goto check_failed;
374
375 if (!bch_bkey_equal_header(k, replace_key))
376 goto check_failed;
377
378 /* skip past gen */
379 offset <<= 8;
380
381 BUG_ON(!KEY_PTRS(replace_key));
382
383 for (i = 0; i < KEY_PTRS(replace_key); i++)
384 if (k->ptr[i] != replace_key->ptr[i] + offset)
385 goto check_failed;
386
387 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
388 }
389
390 if (bkey_cmp(insert, k) < 0 &&
391 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
392 /*
393 * We overlapped in the middle of an existing key: that
394 * means we have to split the old key. But we have to do
395 * slightly different things depending on whether the
396 * old key has been written out yet.
397 */
398
399 struct bkey *top;
400
401 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
402
403 if (bkey_written(b, k)) {
404 /*
405 * We insert a new key to cover the top of the
406 * old key, and the old key is modified in place
407 * to represent the bottom split.
408 *
409 * It's completely arbitrary whether the new key
410 * is the top or the bottom, but it has to match
411 * up with what btree_sort_fixup() does - it
412 * doesn't check for this kind of overlap, it
413 * depends on us inserting a new key for the top
414 * here.
415 */
416 top = bch_bset_search(b, bset_tree_last(b),
417 insert);
418 bch_bset_insert(b, top, k);
419 } else {
420 BKEY_PADDED(key) temp;
421 bkey_copy(&temp.key, k);
422 bch_bset_insert(b, k, &temp.key);
423 top = bkey_next(k);
424 }
425
426 bch_cut_front(insert, top);
427 bch_cut_back(&START_KEY(insert), k);
428 bch_bset_fix_invalidated_key(b, k);
429 goto out;
430 }
431
432 if (bkey_cmp(insert, k) < 0) {
433 bch_cut_front(insert, k);
434 } else {
435 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
436 old_offset = KEY_START(insert);
437
438 if (bkey_written(b, k) &&
439 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
440 /*
441 * Completely overwrote, so we don't have to
442 * invalidate the binary search tree
443 */
444 bch_cut_front(k, k);
445 } else {
446 __bch_cut_back(&START_KEY(insert), k);
447 bch_bset_fix_invalidated_key(b, k);
448 }
449 }
450
451 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
452 }
453
454check_failed:
455 if (replace_key) {
456 if (!sectors_found) {
457 return true;
458 } else if (sectors_found < KEY_SIZE(insert)) {
459 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
460 (KEY_SIZE(insert) - sectors_found));
461 SET_KEY_SIZE(insert, sectors_found);
462 }
463 }
464out:
465 if (KEY_DIRTY(insert))
466 bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
467 KEY_START(insert),
468 KEY_SIZE(insert));
469
470 return false;
471}
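
Note: in bch_extent_insert_fixup() above, an insert that lands strictly inside an existing key splits it into a bottom piece and a top piece (which piece becomes the freshly inserted key depends on whether the old key was already written, per the comment in the patch). A sketch of that middle-overlap split on toy intervals, independent of the bset machinery:

/* Toy model of the middle-overlap case: an insert strictly inside an existing
 * extent splits it into a bottom piece (before the insert) and a top piece
 * (after it). */
#include <stdint.h>
#include <stdio.h>

struct extent {
	uint64_t start, end;	/* half-open sector range */
};

/* Returns 1 and fills *bottom/*top when 'ins' splits 'old' in the middle. */
static int split_around(struct extent old, struct extent ins,
			struct extent *bottom, struct extent *top)
{
	if (ins.start <= old.start || ins.end >= old.end)
		return 0;	/* not a middle overlap */

	bottom->start = old.start;
	bottom->end   = ins.start;	/* like bch_cut_back(START_KEY(insert)) */
	top->start    = ins.end;	/* like bch_cut_front(insert) */
	top->end      = old.end;
	return 1;
}

int main(void)
{
	struct extent old = { 0, 100 }, ins = { 40, 60 }, bot, top;

	if (split_around(old, ins, &bot, &top))
		printf("bottom [%llu,%llu) insert [%llu,%llu) top [%llu,%llu)\n",
		       (unsigned long long) bot.start, (unsigned long long) bot.end,
		       (unsigned long long) ins.start, (unsigned long long) ins.end,
		       (unsigned long long) top.start, (unsigned long long) top.end);
	return 0;
}
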
472
473static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
474{
475 struct btree *b = container_of(bk, struct btree, keys);
476 char buf[80];
477
478 if (!KEY_SIZE(k))
479 return true;
480
481 if (KEY_SIZE(k) > KEY_OFFSET(k))
482 goto bad;
483
484 if (__ptr_invalid(b->c, k))
485 goto bad;
486
487 return false;
488bad:
489 bch_extent_to_text(buf, sizeof(buf), k);
490 cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
491 return true;
492}
493
494static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
495 unsigned ptr)
496{
497 struct bucket *g = PTR_BUCKET(b->c, k, ptr);
498 char buf[80];
499
500 if (mutex_trylock(&b->c->bucket_lock)) {
501 if (b->c->gc_mark_valid &&
502 ((GC_MARK(g) != GC_MARK_DIRTY &&
503 KEY_DIRTY(k)) ||
504 GC_MARK(g) == GC_MARK_METADATA))
505 goto err;
506
507 if (g->prio == BTREE_PRIO)
508 goto err;
509
510 mutex_unlock(&b->c->bucket_lock);
511 }
512
513 return false;
514err:
515 mutex_unlock(&b->c->bucket_lock);
516 bch_extent_to_text(buf, sizeof(buf), k);
517 btree_bug(b,
518"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
519 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
520 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
521 return true;
522}
523
524static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
525{
526 struct btree *b = container_of(bk, struct btree, keys);
527 struct bucket *g;
528 unsigned i, stale;
529
530 if (!KEY_PTRS(k) ||
531 bch_extent_invalid(bk, k))
532 return true;
533
534 for (i = 0; i < KEY_PTRS(k); i++)
535 if (!ptr_available(b->c, k, i))
536 return true;
537
538 if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
539 return false;
540
541 for (i = 0; i < KEY_PTRS(k); i++) {
542 g = PTR_BUCKET(b->c, k, i);
543 stale = ptr_stale(b->c, k, i);
544
545 btree_bug_on(stale > 96, b,
546 "key too stale: %i, need_gc %u",
547 stale, b->c->need_gc);
548
549 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
550 b, "stale dirty pointer");
551
552 if (stale)
553 return true;
554
555 if (expensive_debug_checks(b->c) &&
556 bch_extent_bad_expensive(b, k, i))
557 return true;
558 }
559
560 return false;
561}
562
563static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
564{
565 return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
566 ~((uint64_t)1 << 63);
567}
568
569static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
570{
571 struct btree *b = container_of(bk, struct btree, keys);
572 unsigned i;
573
574 if (key_merging_disabled(b->c))
575 return false;
576
577 for (i = 0; i < KEY_PTRS(l); i++)
578 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
579 PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
580 return false;
581
582 /* Keys with no pointers aren't restricted to one bucket and could
583 * overflow KEY_SIZE
584 */
585 if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
586 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
587 SET_KEY_SIZE(l, USHRT_MAX);
588
589 bch_cut_front(l, r);
590 return false;
591 }
592
593 if (KEY_CSUM(l)) {
594 if (KEY_CSUM(r))
595 l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
596 else
597 SET_KEY_CSUM(l, 0);
598 }
599
600 SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
601 SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
602
603 return true;
604}
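
Note: bch_extent_merge() above only combines two extents when each pointer in the right key continues exactly where the left key's pointer ends, and it refuses to let the merged size overflow the 16-bit KEY_SIZE field (hence the USHRT_MAX check). A small sketch of that size-clamping decision on plain integers, not real bkeys:

/* Sketch of the size check in bch_extent_merge(): a merged extent's size
 * lives in a 16-bit field, so sizes that would exceed USHRT_MAX cannot be
 * fully merged. */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static bool can_merge_sizes(unsigned lsize, unsigned rsize)
{
	return (unsigned long) lsize + rsize <= USHRT_MAX;
}

int main(void)
{
	printf("1024 + 2048   -> %s\n",
	       can_merge_sizes(1024, 2048) ? "merge" : "keep separate");
	printf("60000 + 10000 -> %s\n",
	       can_merge_sizes(60000, 10000) ? "merge" : "keep separate");
	return 0;
}
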
605
606const struct btree_keys_ops bch_extent_keys_ops = {
607 .sort_cmp = bch_extent_sort_cmp,
608 .sort_fixup = bch_extent_sort_fixup,
609 .insert_fixup = bch_extent_insert_fixup,
610 .key_invalid = bch_extent_invalid,
611 .key_bad = bch_extent_bad,
612 .key_merge = bch_extent_merge,
613 .key_to_text = bch_extent_to_text,
614 .key_dump = bch_bkey_dump,
615 .is_extents = true,
616};
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
new file mode 100644
index 000000000000..e4e23409782d
--- /dev/null
+++ b/drivers/md/bcache/extents.h
@@ -0,0 +1,13 @@
1#ifndef _BCACHE_EXTENTS_H
2#define _BCACHE_EXTENTS_H
3
4extern const struct btree_keys_ops bch_btree_keys_ops;
5extern const struct btree_keys_ops bch_extent_keys_ops;
6
7struct bkey;
8struct cache_set;
9
10void bch_extent_to_text(char *, size_t, const struct bkey *);
11bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
12
13#endif /* _BCACHE_EXTENTS_H */
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 9056632995b1..fa028fa82df4 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -11,178 +11,40 @@
11 11
12#include <linux/blkdev.h> 12#include <linux/blkdev.h>
13 13
14static void bch_bi_idx_hack_endio(struct bio *bio, int error)
15{
16 struct bio *p = bio->bi_private;
17
18 bio_endio(p, error);
19 bio_put(bio);
20}
21
22static void bch_generic_make_request_hack(struct bio *bio)
23{
24 if (bio->bi_idx) {
25 struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio));
26
27 memcpy(clone->bi_io_vec,
28 bio_iovec(bio),
29 bio_segments(bio) * sizeof(struct bio_vec));
30
31 clone->bi_sector = bio->bi_sector;
32 clone->bi_bdev = bio->bi_bdev;
33 clone->bi_rw = bio->bi_rw;
34 clone->bi_vcnt = bio_segments(bio);
35 clone->bi_size = bio->bi_size;
36
37 clone->bi_private = bio;
38 clone->bi_end_io = bch_bi_idx_hack_endio;
39
40 bio = clone;
41 }
42
43 /*
44 * Hack, since drivers that clone bios clone up to bi_max_vecs, but our
45 * bios might have had more than that (before we split them per device
46 * limitations).
47 *
48 * To be taken out once immutable bvec stuff is in.
49 */
50 bio->bi_max_vecs = bio->bi_vcnt;
51
52 generic_make_request(bio);
53}
54
55/**
56 * bch_bio_split - split a bio
57 * @bio: bio to split
58 * @sectors: number of sectors to split from the front of @bio
59 * @gfp: gfp mask
60 * @bs: bio set to allocate from
61 *
62 * Allocates and returns a new bio which represents @sectors from the start of
63 * @bio, and updates @bio to represent the remaining sectors.
64 *
65 * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio
66 * unchanged.
67 *
68 * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
69 * bvec boundry; it is the caller's responsibility to ensure that @bio is not
70 * freed before the split.
71 */
72struct bio *bch_bio_split(struct bio *bio, int sectors,
73 gfp_t gfp, struct bio_set *bs)
74{
75 unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9;
76 struct bio_vec *bv;
77 struct bio *ret = NULL;
78
79 BUG_ON(sectors <= 0);
80
81 if (sectors >= bio_sectors(bio))
82 return bio;
83
84 if (bio->bi_rw & REQ_DISCARD) {
85 ret = bio_alloc_bioset(gfp, 1, bs);
86 if (!ret)
87 return NULL;
88 idx = 0;
89 goto out;
90 }
91
92 bio_for_each_segment(bv, bio, idx) {
93 vcnt = idx - bio->bi_idx;
94
95 if (!nbytes) {
96 ret = bio_alloc_bioset(gfp, vcnt, bs);
97 if (!ret)
98 return NULL;
99
100 memcpy(ret->bi_io_vec, bio_iovec(bio),
101 sizeof(struct bio_vec) * vcnt);
102
103 break;
104 } else if (nbytes < bv->bv_len) {
105 ret = bio_alloc_bioset(gfp, ++vcnt, bs);
106 if (!ret)
107 return NULL;
108
109 memcpy(ret->bi_io_vec, bio_iovec(bio),
110 sizeof(struct bio_vec) * vcnt);
111
112 ret->bi_io_vec[vcnt - 1].bv_len = nbytes;
113 bv->bv_offset += nbytes;
114 bv->bv_len -= nbytes;
115 break;
116 }
117
118 nbytes -= bv->bv_len;
119 }
120out:
121 ret->bi_bdev = bio->bi_bdev;
122 ret->bi_sector = bio->bi_sector;
123 ret->bi_size = sectors << 9;
124 ret->bi_rw = bio->bi_rw;
125 ret->bi_vcnt = vcnt;
126 ret->bi_max_vecs = vcnt;
127
128 bio->bi_sector += sectors;
129 bio->bi_size -= sectors << 9;
130 bio->bi_idx = idx;
131
132 if (bio_integrity(bio)) {
133 if (bio_integrity_clone(ret, bio, gfp)) {
134 bio_put(ret);
135 return NULL;
136 }
137
138 bio_integrity_trim(ret, 0, bio_sectors(ret));
139 bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio));
140 }
141
142 return ret;
143}
144
145static unsigned bch_bio_max_sectors(struct bio *bio) 14static unsigned bch_bio_max_sectors(struct bio *bio)
146{ 15{
147 unsigned ret = bio_sectors(bio);
148 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 16 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
149 unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, 17 struct bio_vec bv;
150 queue_max_segments(q)); 18 struct bvec_iter iter;
19 unsigned ret = 0, seg = 0;
151 20
152 if (bio->bi_rw & REQ_DISCARD) 21 if (bio->bi_rw & REQ_DISCARD)
153 return min(ret, q->limits.max_discard_sectors); 22 return min(bio_sectors(bio), q->limits.max_discard_sectors);
154 23
155 if (bio_segments(bio) > max_segments || 24 bio_for_each_segment(bv, bio, iter) {
156 q->merge_bvec_fn) { 25 struct bvec_merge_data bvm = {
157 struct bio_vec *bv; 26 .bi_bdev = bio->bi_bdev,
158 int i, seg = 0; 27 .bi_sector = bio->bi_iter.bi_sector,
159 28 .bi_size = ret << 9,
160 ret = 0; 29 .bi_rw = bio->bi_rw,
161 30 };
162 bio_for_each_segment(bv, bio, i) { 31
163 struct bvec_merge_data bvm = { 32 if (seg == min_t(unsigned, BIO_MAX_PAGES,
164 .bi_bdev = bio->bi_bdev, 33 queue_max_segments(q)))
165 .bi_sector = bio->bi_sector, 34 break;
166 .bi_size = ret << 9,
167 .bi_rw = bio->bi_rw,
168 };
169
170 if (seg == max_segments)
171 break;
172 35
173 if (q->merge_bvec_fn && 36 if (q->merge_bvec_fn &&
174 q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) 37 q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
175 break; 38 break;
176 39
177 seg++; 40 seg++;
178 ret += bv->bv_len >> 9; 41 ret += bv.bv_len >> 9;
179 }
180 } 42 }
181 43
182 ret = min(ret, queue_max_sectors(q)); 44 ret = min(ret, queue_max_sectors(q));
183 45
184 WARN_ON(!ret); 46 WARN_ON(!ret);
185 ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); 47 ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9);
186 48
187 return ret; 49 return ret;
188} 50}
@@ -193,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl)
193 55
194 s->bio->bi_end_io = s->bi_end_io; 56 s->bio->bi_end_io = s->bi_end_io;
195 s->bio->bi_private = s->bi_private; 57 s->bio->bi_private = s->bi_private;
196 bio_endio(s->bio, 0); 58 bio_endio_nodec(s->bio, 0);
197 59
198 closure_debug_destroy(&s->cl); 60 closure_debug_destroy(&s->cl);
199 mempool_free(s, s->p->bio_split_hook); 61 mempool_free(s, s->p->bio_split_hook);
@@ -232,19 +94,19 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
232 bio_get(bio); 94 bio_get(bio);
233 95
234 do { 96 do {
235 n = bch_bio_split(bio, bch_bio_max_sectors(bio), 97 n = bio_next_split(bio, bch_bio_max_sectors(bio),
236 GFP_NOIO, s->p->bio_split); 98 GFP_NOIO, s->p->bio_split);
237 99
238 n->bi_end_io = bch_bio_submit_split_endio; 100 n->bi_end_io = bch_bio_submit_split_endio;
239 n->bi_private = &s->cl; 101 n->bi_private = &s->cl;
240 102
241 closure_get(&s->cl); 103 closure_get(&s->cl);
242 bch_generic_make_request_hack(n); 104 generic_make_request(n);
243 } while (n != bio); 105 } while (n != bio);
244 106
245 continue_at(&s->cl, bch_bio_submit_split_done, NULL); 107 continue_at(&s->cl, bch_bio_submit_split_done, NULL);
246submit: 108submit:
247 bch_generic_make_request_hack(bio); 109 generic_make_request(bio);
248} 110}
249 111
250/* Bios with headers */ 112/* Bios with headers */
@@ -272,8 +134,8 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
272{ 134{
273 struct bbio *b = container_of(bio, struct bbio, bio); 135 struct bbio *b = container_of(bio, struct bbio, bio);
274 136
275 bio->bi_sector = PTR_OFFSET(&b->key, 0); 137 bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
276 bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; 138 bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev;
277 139
278 b->submit_time_us = local_clock_us(); 140 b->submit_time_us = local_clock_us();
279 closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); 141 closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0));
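
Note on the io.c changes above: with immutable biovecs, bch_bio_max_sectors() walks the bio with bio_for_each_segment() over a bvec_iter, stopping at the queue's segment limit or where merge_bvec_fn refuses, and the open-coded bch_bio_split() is dropped in favour of the block layer's bio_next_split(). A userspace sketch of the accumulation loop over an array of segments (toy types, not the real bio structures):

/* Toy model of the bch_bio_max_sectors() loop: walk the segments of a
 * request, stop at a segment-count limit, and report how many sectors fit. */
#include <stdio.h>

struct toy_segment {
	unsigned len;		/* bytes in this segment */
};

static unsigned max_sectors(const struct toy_segment *seg, unsigned nr_segs,
			    unsigned max_segments)
{
	unsigned ret = 0, i;

	for (i = 0; i < nr_segs && i < max_segments; i++)
		ret += seg[i].len >> 9;		/* bytes -> 512-byte sectors */

	return ret;
}

int main(void)
{
	struct toy_segment segs[] = { { 4096 }, { 4096 }, { 8192 }, { 4096 } };

	printf("limit 2 segments: %u sectors\n", max_sectors(segs, 4, 2));
	printf("limit 8 segments: %u sectors\n", max_sectors(segs, 4, 8));
	return 0;
}
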
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ecdaa671bd50..18039affc306 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -44,17 +44,17 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
44 44
45 closure_init_stack(&cl); 45 closure_init_stack(&cl);
46 46
47 pr_debug("reading %llu", (uint64_t) bucket); 47 pr_debug("reading %u", bucket_index);
48 48
49 while (offset < ca->sb.bucket_size) { 49 while (offset < ca->sb.bucket_size) {
50reread: left = ca->sb.bucket_size - offset; 50reread: left = ca->sb.bucket_size - offset;
51 len = min_t(unsigned, left, PAGE_SECTORS * 8); 51 len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
52 52
53 bio_reset(bio); 53 bio_reset(bio);
54 bio->bi_sector = bucket + offset; 54 bio->bi_iter.bi_sector = bucket + offset;
55 bio->bi_bdev = ca->bdev; 55 bio->bi_bdev = ca->bdev;
56 bio->bi_rw = READ; 56 bio->bi_rw = READ;
57 bio->bi_size = len << 9; 57 bio->bi_iter.bi_size = len << 9;
58 58
59 bio->bi_end_io = journal_read_endio; 59 bio->bi_end_io = journal_read_endio;
60 bio->bi_private = &cl; 60 bio->bi_private = &cl;
@@ -74,19 +74,28 @@ reread: left = ca->sb.bucket_size - offset;
74 struct list_head *where; 74 struct list_head *where;
75 size_t blocks, bytes = set_bytes(j); 75 size_t blocks, bytes = set_bytes(j);
76 76
77 if (j->magic != jset_magic(&ca->sb)) 77 if (j->magic != jset_magic(&ca->sb)) {
78 pr_debug("%u: bad magic", bucket_index);
78 return ret; 79 return ret;
80 }
79 81
80 if (bytes > left << 9) 82 if (bytes > left << 9 ||
83 bytes > PAGE_SIZE << JSET_BITS) {
84 pr_info("%u: too big, %zu bytes, offset %u",
85 bucket_index, bytes, offset);
81 return ret; 86 return ret;
87 }
82 88
83 if (bytes > len << 9) 89 if (bytes > len << 9)
84 goto reread; 90 goto reread;
85 91
86 if (j->csum != csum_set(j)) 92 if (j->csum != csum_set(j)) {
93 pr_info("%u: bad csum, %zu bytes, offset %u",
94 bucket_index, bytes, offset);
87 return ret; 95 return ret;
96 }
88 97
89 blocks = set_blocks(j, ca->set); 98 blocks = set_blocks(j, block_bytes(ca->set));
90 99
91 while (!list_empty(list)) { 100 while (!list_empty(list)) {
92 i = list_first_entry(list, 101 i = list_first_entry(list,
@@ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
275 } 284 }
276 285
277 for (k = i->j.start; 286 for (k = i->j.start;
278 k < end(&i->j); 287 k < bset_bkey_last(&i->j);
279 k = bkey_next(k)) { 288 k = bkey_next(k)) {
280 unsigned j; 289 unsigned j;
281 290
@@ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
313 n, i->j.seq - 1, start, end); 322 n, i->j.seq - 1, start, end);
314 323
315 for (k = i->j.start; 324 for (k = i->j.start;
316 k < end(&i->j); 325 k < bset_bkey_last(&i->j);
317 k = bkey_next(k)) { 326 k = bkey_next(k)) {
318 trace_bcache_journal_replay_key(k); 327 trace_bcache_journal_replay_key(k);
319 328
@@ -437,13 +446,13 @@ static void do_journal_discard(struct cache *ca)
437 atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); 446 atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
438 447
439 bio_init(bio); 448 bio_init(bio);
440 bio->bi_sector = bucket_to_sector(ca->set, 449 bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
441 ca->sb.d[ja->discard_idx]); 450 ca->sb.d[ja->discard_idx]);
442 bio->bi_bdev = ca->bdev; 451 bio->bi_bdev = ca->bdev;
443 bio->bi_rw = REQ_WRITE|REQ_DISCARD; 452 bio->bi_rw = REQ_WRITE|REQ_DISCARD;
444 bio->bi_max_vecs = 1; 453 bio->bi_max_vecs = 1;
445 bio->bi_io_vec = bio->bi_inline_vecs; 454 bio->bi_io_vec = bio->bi_inline_vecs;
446 bio->bi_size = bucket_bytes(ca); 455 bio->bi_iter.bi_size = bucket_bytes(ca);
447 bio->bi_end_io = journal_discard_endio; 456 bio->bi_end_io = journal_discard_endio;
448 457
449 closure_get(&ca->set->cl); 458 closure_get(&ca->set->cl);
@@ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl)
555 continue_at_nobarrier(cl, journal_write, system_wq); 564 continue_at_nobarrier(cl, journal_write, system_wq);
556} 565}
557 566
567static void journal_write_unlock(struct closure *cl)
568{
569 struct cache_set *c = container_of(cl, struct cache_set, journal.io);
570
571 c->journal.io_in_flight = 0;
572 spin_unlock(&c->journal.lock);
573}
574
558static void journal_write_unlocked(struct closure *cl) 575static void journal_write_unlocked(struct closure *cl)
559 __releases(c->journal.lock) 576 __releases(c->journal.lock)
560{ 577{
@@ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl)
562 struct cache *ca; 579 struct cache *ca;
563 struct journal_write *w = c->journal.cur; 580 struct journal_write *w = c->journal.cur;
564 struct bkey *k = &c->journal.key; 581 struct bkey *k = &c->journal.key;
565 unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; 582 unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
583 c->sb.block_size;
566 584
567 struct bio *bio; 585 struct bio *bio;
568 struct bio_list list; 586 struct bio_list list;
569 bio_list_init(&list); 587 bio_list_init(&list);
570 588
571 if (!w->need_write) { 589 if (!w->need_write) {
572 /* 590 closure_return_with_destructor(cl, journal_write_unlock);
573 * XXX: have to unlock closure before we unlock journal lock,
574 * else we race with bch_journal(). But this way we race
575 * against cache set unregister. Doh.
576 */
577 set_closure_fn(cl, NULL, NULL);
578 closure_sub(cl, CLOSURE_RUNNING + 1);
579 spin_unlock(&c->journal.lock);
580 return;
581 } else if (journal_full(&c->journal)) { 591 } else if (journal_full(&c->journal)) {
582 journal_reclaim(c); 592 journal_reclaim(c);
583 spin_unlock(&c->journal.lock); 593 spin_unlock(&c->journal.lock);
@@ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
586 continue_at(cl, journal_write, system_wq); 596 continue_at(cl, journal_write, system_wq);
587 } 597 }
588 598
589 c->journal.blocks_free -= set_blocks(w->data, c); 599 c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
590 600
591 w->data->btree_level = c->root->level; 601 w->data->btree_level = c->root->level;
592 602
@@ -608,10 +618,10 @@ static void journal_write_unlocked(struct closure *cl)
608 atomic_long_add(sectors, &ca->meta_sectors_written); 618 atomic_long_add(sectors, &ca->meta_sectors_written);
609 619
610 bio_reset(bio); 620 bio_reset(bio);
611 bio->bi_sector = PTR_OFFSET(k, i); 621 bio->bi_iter.bi_sector = PTR_OFFSET(k, i);
612 bio->bi_bdev = ca->bdev; 622 bio->bi_bdev = ca->bdev;
613 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; 623 bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
614 bio->bi_size = sectors << 9; 624 bio->bi_iter.bi_size = sectors << 9;
615 625
616 bio->bi_end_io = journal_write_endio; 626 bio->bi_end_io = journal_write_endio;
617 bio->bi_private = w; 627 bio->bi_private = w;
@@ -653,10 +663,12 @@ static void journal_try_write(struct cache_set *c)
653 663
654 w->need_write = true; 664 w->need_write = true;
655 665
656 if (closure_trylock(cl, &c->cl)) 666 if (!c->journal.io_in_flight) {
657 journal_write_unlocked(cl); 667 c->journal.io_in_flight = 1;
658 else 668 closure_call(cl, journal_write_unlocked, NULL, &c->cl);
669 } else {
659 spin_unlock(&c->journal.lock); 670 spin_unlock(&c->journal.lock);
671 }
660} 672}
661 673
662static struct journal_write *journal_wait_for_write(struct cache_set *c, 674static struct journal_write *journal_wait_for_write(struct cache_set *c,
@@ -664,6 +676,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
664{ 676{
665 size_t sectors; 677 size_t sectors;
666 struct closure cl; 678 struct closure cl;
679 bool wait = false;
667 680
668 closure_init_stack(&cl); 681 closure_init_stack(&cl);
669 682
@@ -673,16 +686,19 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
673 struct journal_write *w = c->journal.cur; 686 struct journal_write *w = c->journal.cur;
674 687
675 sectors = __set_blocks(w->data, w->data->keys + nkeys, 688 sectors = __set_blocks(w->data, w->data->keys + nkeys,
676 c) * c->sb.block_size; 689 block_bytes(c)) * c->sb.block_size;
677 690
678 if (sectors <= min_t(size_t, 691 if (sectors <= min_t(size_t,
679 c->journal.blocks_free * c->sb.block_size, 692 c->journal.blocks_free * c->sb.block_size,
680 PAGE_SECTORS << JSET_BITS)) 693 PAGE_SECTORS << JSET_BITS))
681 return w; 694 return w;
682 695
683 /* XXX: tracepoint */ 696 if (wait)
697 closure_wait(&c->journal.wait, &cl);
698
684 if (!journal_full(&c->journal)) { 699 if (!journal_full(&c->journal)) {
685 trace_bcache_journal_entry_full(c); 700 if (wait)
701 trace_bcache_journal_entry_full(c);
686 702
687 /* 703 /*
688 * XXX: If we were inserting so many keys that they 704 * XXX: If we were inserting so many keys that they
@@ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
692 */ 708 */
693 BUG_ON(!w->data->keys); 709 BUG_ON(!w->data->keys);
694 710
695 closure_wait(&w->wait, &cl);
696 journal_try_write(c); /* unlocks */ 711 journal_try_write(c); /* unlocks */
697 } else { 712 } else {
698 trace_bcache_journal_full(c); 713 if (wait)
714 trace_bcache_journal_full(c);
699 715
700 closure_wait(&c->journal.wait, &cl);
701 journal_reclaim(c); 716 journal_reclaim(c);
702 spin_unlock(&c->journal.lock); 717 spin_unlock(&c->journal.lock);
703 718
@@ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
706 721
707 closure_sync(&cl); 722 closure_sync(&cl);
708 spin_lock(&c->journal.lock); 723 spin_lock(&c->journal.lock);
724 wait = true;
709 } 725 }
710} 726}
711 727
@@ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c,
736 752
737 w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); 753 w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
738 754
739 memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); 755 memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
740 w->data->keys += bch_keylist_nkeys(keys); 756 w->data->keys += bch_keylist_nkeys(keys);
741 757
742 ret = &fifo_back(&c->journal.pin); 758 ret = &fifo_back(&c->journal.pin);
@@ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c)
780{ 796{
781 struct journal *j = &c->journal; 797 struct journal *j = &c->journal;
782 798
783 closure_init_unlocked(&j->io);
784 spin_lock_init(&j->lock); 799 spin_lock_init(&j->lock);
785 INIT_DELAYED_WORK(&j->work, journal_write_work); 800 INIT_DELAYED_WORK(&j->work, journal_write_work);
786 801
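
Note on the journal.c changes above: journal_read_bucket() now rejects entries stepwise and logs why, in order: wrong magic, a size larger than what is left in the bucket or than the journal write buffer, and finally a bad checksum. A sketch of that validate-before-use ordering with a toy header and a placeholder checksum, not the real jset layout or csum_set():

/* Sketch of the journal_read_bucket() validation order: check the magic,
 * then the size bounds, then the checksum, and report which test failed. */
#include <stdint.h>
#include <stdio.h>

#define TOY_MAGIC	0x6a6f75726eULL	/* stand-in for jset_magic() */

struct toy_jset {
	uint64_t magic;
	uint64_t csum;
	uint32_t bytes;		/* payload size */
};

static uint64_t toy_csum(const struct toy_jset *j)
{
	return j->magic ^ j->bytes;	/* placeholder for csum_set() */
}

static const char *validate(const struct toy_jset *j, uint32_t bytes_left,
			    uint32_t max_bytes)
{
	if (j->magic != TOY_MAGIC)
		return "bad magic";
	if (j->bytes > bytes_left || j->bytes > max_bytes)
		return "too big";
	if (j->csum != toy_csum(j))
		return "bad csum";
	return "ok";
}

int main(void)
{
	struct toy_jset j = { TOY_MAGIC, 0, 4096 };

	j.csum = toy_csum(&j);
	printf("entry: %s\n", validate(&j, 8192, 1 << 16));
	j.bytes = 1 << 20;
	printf("oversized entry: %s\n", validate(&j, 8192, 1 << 16));
	return 0;
}
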
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index a6472fda94b2..9180c4465075 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -104,6 +104,7 @@ struct journal {
104 /* used when waiting because the journal was full */ 104 /* used when waiting because the journal was full */
105 struct closure_waitlist wait; 105 struct closure_waitlist wait;
106 struct closure io; 106 struct closure io;
107 int io_in_flight;
107 struct delayed_work work; 108 struct delayed_work work;
108 109
109 /* Number of blocks free in the bucket(s) we're currently writing to */ 110 /* Number of blocks free in the bucket(s) we're currently writing to */
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index f2f0998c4a91..9eb60d102de8 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -86,7 +86,7 @@ static void moving_init(struct moving_io *io)
86 bio_get(bio); 86 bio_get(bio);
87 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 87 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
88 88
89 bio->bi_size = KEY_SIZE(&io->w->key) << 9; 89 bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9;
90 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), 90 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
91 PAGE_SECTORS); 91 PAGE_SECTORS);
92 bio->bi_private = &io->cl; 92 bio->bi_private = &io->cl;
@@ -102,7 +102,7 @@ static void write_moving(struct closure *cl)
102 if (!op->error) { 102 if (!op->error) {
103 moving_init(io); 103 moving_init(io);
104 104
105 io->bio.bio.bi_sector = KEY_START(&io->w->key); 105 io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key);
106 op->write_prio = 1; 106 op->write_prio = 1;
107 op->bio = &io->bio.bio; 107 op->bio = &io->bio.bio;
108 108
@@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c)
211 for_each_cache(ca, c, i) { 211 for_each_cache(ca, c, i) {
212 unsigned sectors_to_move = 0; 212 unsigned sectors_to_move = 0;
213 unsigned reserve_sectors = ca->sb.bucket_size * 213 unsigned reserve_sectors = ca->sb.bucket_size *
214 min(fifo_used(&ca->free), ca->free.size / 2); 214 fifo_used(&ca->free[RESERVE_MOVINGGC]);
215 215
216 ca->heap.used = 0; 216 ca->heap.used = 0;
217 217
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index fbcc851ed5a5..5d5d031cf381 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
163static void bcachecg_destroy(struct cgroup *cgroup) 163static void bcachecg_destroy(struct cgroup *cgroup)
164{ 164{
165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup); 165 struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
166 free_css_id(&bcache_subsys, &cg->css);
167 kfree(cg); 166 kfree(cg);
168} 167}
169 168
@@ -198,14 +197,14 @@ static bool verify(struct cached_dev *dc, struct bio *bio)
198 197
199static void bio_csum(struct bio *bio, struct bkey *k) 198static void bio_csum(struct bio *bio, struct bkey *k)
200{ 199{
201 struct bio_vec *bv; 200 struct bio_vec bv;
201 struct bvec_iter iter;
202 uint64_t csum = 0; 202 uint64_t csum = 0;
203 int i;
204 203
205 bio_for_each_segment(bv, bio, i) { 204 bio_for_each_segment(bv, bio, iter) {
206 void *d = kmap(bv->bv_page) + bv->bv_offset; 205 void *d = kmap(bv.bv_page) + bv.bv_offset;
207 csum = bch_crc64_update(csum, d, bv->bv_len); 206 csum = bch_crc64_update(csum, d, bv.bv_len);
208 kunmap(bv->bv_page); 207 kunmap(bv.bv_page);
209 } 208 }
210 209
211 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); 210 k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
@@ -255,26 +254,44 @@ static void bch_data_insert_keys(struct closure *cl)
255 closure_return(cl); 254 closure_return(cl);
256} 255}
257 256
257static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
258 struct cache_set *c)
259{
260 size_t oldsize = bch_keylist_nkeys(l);
261 size_t newsize = oldsize + u64s;
262
263 /*
264 * The journalling code doesn't handle the case where the keys to insert
265 * is bigger than an empty write: If we just return -ENOMEM here,
266 * bio_insert() and bio_invalidate() will insert the keys created so far
267 * and finish the rest when the keylist is empty.
268 */
269 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
270 return -ENOMEM;
271
272 return __bch_keylist_realloc(l, u64s);
273}
274
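
Note: the new bch_keylist_realloc() wrapper above caps a single insert's keys at what fits in one journal block after the struct jset header, returning -ENOMEM so the caller flushes what it already has and continues. A toy version of that size check, with made-up block and header sizes:

/* Sketch of the size cap in request.c's bch_keylist_realloc(): refuse to grow
 * the keylist past what one journal block can hold after its header. */
#include <stdint.h>
#include <stdio.h>
#include <errno.h>

#define TOY_BLOCK_BYTES	4096u	/* stand-in for block_bytes(c) */
#define TOY_JSET_HDR	64u	/* stand-in for sizeof(struct jset) */

static int keylist_would_fit(size_t old_u64s, size_t extra_u64s)
{
	size_t newsize = old_u64s + extra_u64s;

	if (newsize * sizeof(uint64_t) > TOY_BLOCK_BYTES - TOY_JSET_HDR)
		return -ENOMEM;	/* caller inserts what it has and retries */
	return 0;
}

int main(void)
{
	printf("grow by 3 u64s:   %d\n", keylist_would_fit(100, 3));
	printf("grow by 600 u64s: %d\n", keylist_would_fit(100, 600));
	return 0;
}
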
258static void bch_data_invalidate(struct closure *cl) 275static void bch_data_invalidate(struct closure *cl)
259{ 276{
260 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 277 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
261 struct bio *bio = op->bio; 278 struct bio *bio = op->bio;
262 279
263 pr_debug("invalidating %i sectors from %llu", 280 pr_debug("invalidating %i sectors from %llu",
264 bio_sectors(bio), (uint64_t) bio->bi_sector); 281 bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
265 282
266 while (bio_sectors(bio)) { 283 while (bio_sectors(bio)) {
267 unsigned sectors = min(bio_sectors(bio), 284 unsigned sectors = min(bio_sectors(bio),
268 1U << (KEY_SIZE_BITS - 1)); 285 1U << (KEY_SIZE_BITS - 1));
269 286
270 if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) 287 if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
271 goto out; 288 goto out;
272 289
273 bio->bi_sector += sectors; 290 bio->bi_iter.bi_sector += sectors;
274 bio->bi_size -= sectors << 9; 291 bio->bi_iter.bi_size -= sectors << 9;
275 292
276 bch_keylist_add(&op->insert_keys, 293 bch_keylist_add(&op->insert_keys,
277 &KEY(op->inode, bio->bi_sector, sectors)); 294 &KEY(op->inode, bio->bi_iter.bi_sector, sectors));
278 } 295 }
279 296
280 op->insert_data_done = true; 297 op->insert_data_done = true;
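
Note: bch_data_invalidate() above turns a bypassed write into whiteout keys, chopping the bio's range into pieces no larger than the key size field can encode and advancing bi_iter.bi_sector as it goes. A sketch of that chunking loop; KEY_SIZE_BITS == 16 is an assumption here, mirroring the 1U << (KEY_SIZE_BITS - 1) cap in the patch rather than quoting the header:

/* Toy model of the bch_data_invalidate() loop: split a sector range into
 * chunks that fit in a single key's size field. */
#include <stdint.h>
#include <stdio.h>

#define TOY_KEY_SIZE_BITS 16	/* assumed value of KEY_SIZE_BITS */

int main(void)
{
	uint64_t sector = 1000;
	unsigned remaining = 70000;	/* sectors left to invalidate */

	while (remaining) {
		unsigned chunk = remaining;
		unsigned max = 1U << (TOY_KEY_SIZE_BITS - 1);

		if (chunk > max)
			chunk = max;

		printf("invalidate %u sectors starting at %llu\n",
		       chunk, (unsigned long long) sector);

		sector += chunk;	/* like bio->bi_iter.bi_sector += sectors */
		remaining -= chunk;
	}
	return 0;
}
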
@@ -336,14 +353,14 @@ static void bch_data_insert_start(struct closure *cl)
336 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 353 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
337 struct bio *bio = op->bio, *n; 354 struct bio *bio = op->bio, *n;
338 355
339 if (op->bypass)
340 return bch_data_invalidate(cl);
341
342 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { 356 if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
343 set_gc_sectors(op->c); 357 set_gc_sectors(op->c);
344 wake_up_gc(op->c); 358 wake_up_gc(op->c);
345 } 359 }
346 360
361 if (op->bypass)
362 return bch_data_invalidate(cl);
363
347 /* 364 /*
348 * Journal writes are marked REQ_FLUSH; if the original write was a 365 * Journal writes are marked REQ_FLUSH; if the original write was a
349 * flush, it'll wait on the journal write. 366 * flush, it'll wait on the journal write.
@@ -357,21 +374,21 @@ static void bch_data_insert_start(struct closure *cl)
357 374
358 /* 1 for the device pointer and 1 for the chksum */ 375 /* 1 for the device pointer and 1 for the chksum */
359 if (bch_keylist_realloc(&op->insert_keys, 376 if (bch_keylist_realloc(&op->insert_keys,
360 1 + (op->csum ? 1 : 0), 377 3 + (op->csum ? 1 : 0),
361 op->c)) 378 op->c))
362 continue_at(cl, bch_data_insert_keys, bcache_wq); 379 continue_at(cl, bch_data_insert_keys, bcache_wq);
363 380
364 k = op->insert_keys.top; 381 k = op->insert_keys.top;
365 bkey_init(k); 382 bkey_init(k);
366 SET_KEY_INODE(k, op->inode); 383 SET_KEY_INODE(k, op->inode);
367 SET_KEY_OFFSET(k, bio->bi_sector); 384 SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);
368 385
369 if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), 386 if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
370 op->write_point, op->write_prio, 387 op->write_point, op->write_prio,
371 op->writeback)) 388 op->writeback))
372 goto err; 389 goto err;
373 390
374 n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); 391 n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);
375 392
376 n->bi_end_io = bch_data_insert_endio; 393 n->bi_end_io = bch_data_insert_endio;
377 n->bi_private = cl; 394 n->bi_private = cl;
@@ -522,7 +539,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
522 (bio->bi_rw & REQ_WRITE))) 539 (bio->bi_rw & REQ_WRITE)))
523 goto skip; 540 goto skip;
524 541
525 if (bio->bi_sector & (c->sb.block_size - 1) || 542 if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
526 bio_sectors(bio) & (c->sb.block_size - 1)) { 543 bio_sectors(bio) & (c->sb.block_size - 1)) {
527 pr_debug("skipping unaligned io"); 544 pr_debug("skipping unaligned io");
528 goto skip; 545 goto skip;
@@ -546,8 +563,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
546 563
547 spin_lock(&dc->io_lock); 564 spin_lock(&dc->io_lock);
548 565
549 hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) 566 hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
550 if (i->last == bio->bi_sector && 567 if (i->last == bio->bi_iter.bi_sector &&
551 time_before(jiffies, i->jiffies)) 568 time_before(jiffies, i->jiffies))
552 goto found; 569 goto found;
553 570
@@ -556,8 +573,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
556 add_sequential(task); 573 add_sequential(task);
557 i->sequential = 0; 574 i->sequential = 0;
558found: 575found:
559 if (i->sequential + bio->bi_size > i->sequential) 576 if (i->sequential + bio->bi_iter.bi_size > i->sequential)
560 i->sequential += bio->bi_size; 577 i->sequential += bio->bi_iter.bi_size;
561 578
562 i->last = bio_end_sector(bio); 579 i->last = bio_end_sector(bio);
563 i->jiffies = jiffies + msecs_to_jiffies(5000); 580 i->jiffies = jiffies + msecs_to_jiffies(5000);
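
Note: the bypass heuristic above keeps a small per-device table of recent request end sectors, hashed by bi_sector; when a new request starts where a previous one ended, the I/O size is added to i->sequential, and the "i->sequential + size > i->sequential" test guards the running total against overflow. A sketch of that accounting with the same overflow guard, using a single slot instead of the hash table:

/* Sketch of the sequential-I/O accounting in check_should_bypass(): add the
 * request size to the running total only if it does not overflow, mirroring
 * the overflow test in the patch.  One slot stands in for the hash table. */
#include <stdint.h>
#include <stdio.h>

struct io_track {
	uint64_t last;		/* end sector of the previous request */
	unsigned sequential;	/* bytes seen in this sequential stream */
};

static void account(struct io_track *i, uint64_t sector, unsigned bytes)
{
	if (sector != i->last)
		i->sequential = 0;	/* stream broken: start over */

	if (i->sequential + bytes > i->sequential)	/* overflow guard */
		i->sequential += bytes;

	i->last = sector + (bytes >> 9);
}

int main(void)
{
	struct io_track t = { 0, 0 };

	account(&t, 0, 4096);
	account(&t, 8, 4096);		/* continues where the last one ended */
	account(&t, 1000, 4096);	/* random seek resets the counter */
	printf("sequential bytes after seek: %u\n", t.sequential);
	return 0;
}
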
@@ -597,16 +614,13 @@ struct search {
597 /* Stack frame for bio_complete */ 614 /* Stack frame for bio_complete */
598 struct closure cl; 615 struct closure cl;
599 616
600 struct bcache_device *d;
601
602 struct bbio bio; 617 struct bbio bio;
603 struct bio *orig_bio; 618 struct bio *orig_bio;
604 struct bio *cache_miss; 619 struct bio *cache_miss;
620 struct bcache_device *d;
605 621
606 unsigned insert_bio_sectors; 622 unsigned insert_bio_sectors;
607
608 unsigned recoverable:1; 623 unsigned recoverable:1;
609 unsigned unaligned_bvec:1;
610 unsigned write:1; 624 unsigned write:1;
611 unsigned read_dirty_data:1; 625 unsigned read_dirty_data:1;
612 626
@@ -631,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error)
631 645
632 if (error) 646 if (error)
633 s->iop.error = error; 647 s->iop.error = error;
634 else if (ptr_stale(s->iop.c, &b->key, 0)) { 648 else if (!KEY_DIRTY(&b->key) &&
649 ptr_stale(s->iop.c, &b->key, 0)) {
635 atomic_long_inc(&s->iop.c->cache_read_races); 650 atomic_long_inc(&s->iop.c->cache_read_races);
636 s->iop.error = -EINTR; 651 s->iop.error = -EINTR;
637 } 652 }
@@ -650,15 +665,15 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
650 struct bkey *bio_key; 665 struct bkey *bio_key;
651 unsigned ptr; 666 unsigned ptr;
652 667
653 if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0) 668 if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
654 return MAP_CONTINUE; 669 return MAP_CONTINUE;
655 670
656 if (KEY_INODE(k) != s->iop.inode || 671 if (KEY_INODE(k) != s->iop.inode ||
657 KEY_START(k) > bio->bi_sector) { 672 KEY_START(k) > bio->bi_iter.bi_sector) {
658 unsigned bio_sectors = bio_sectors(bio); 673 unsigned bio_sectors = bio_sectors(bio);
659 unsigned sectors = KEY_INODE(k) == s->iop.inode 674 unsigned sectors = KEY_INODE(k) == s->iop.inode
660 ? min_t(uint64_t, INT_MAX, 675 ? min_t(uint64_t, INT_MAX,
661 KEY_START(k) - bio->bi_sector) 676 KEY_START(k) - bio->bi_iter.bi_sector)
662 : INT_MAX; 677 : INT_MAX;
663 678
664 int ret = s->d->cache_miss(b, s, bio, sectors); 679 int ret = s->d->cache_miss(b, s, bio, sectors);
@@ -680,14 +695,14 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
680 if (KEY_DIRTY(k)) 695 if (KEY_DIRTY(k))
681 s->read_dirty_data = true; 696 s->read_dirty_data = true;
682 697
683 n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, 698 n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
684 KEY_OFFSET(k) - bio->bi_sector), 699 KEY_OFFSET(k) - bio->bi_iter.bi_sector),
685 GFP_NOIO, s->d->bio_split); 700 GFP_NOIO, s->d->bio_split);
686 701
687 bio_key = &container_of(n, struct bbio, bio)->key; 702 bio_key = &container_of(n, struct bbio, bio)->key;
688 bch_bkey_copy_single_ptr(bio_key, k, ptr); 703 bch_bkey_copy_single_ptr(bio_key, k, ptr);
689 704
690 bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key); 705 bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
691 bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); 706 bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
692 707
693 n->bi_end_io = bch_cache_read_endio; 708 n->bi_end_io = bch_cache_read_endio;
@@ -712,10 +727,13 @@ static void cache_lookup(struct closure *cl)
712{ 727{
713 struct search *s = container_of(cl, struct search, iop.cl); 728 struct search *s = container_of(cl, struct search, iop.cl);
714 struct bio *bio = &s->bio.bio; 729 struct bio *bio = &s->bio.bio;
730 int ret;
731
732 bch_btree_op_init(&s->op, -1);
715 733
716 int ret = bch_btree_map_keys(&s->op, s->iop.c, 734 ret = bch_btree_map_keys(&s->op, s->iop.c,
717 &KEY(s->iop.inode, bio->bi_sector, 0), 735 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
718 cache_lookup_fn, MAP_END_KEY); 736 cache_lookup_fn, MAP_END_KEY);
719 if (ret == -EAGAIN) 737 if (ret == -EAGAIN)
720 continue_at(cl, cache_lookup, bcache_wq); 738 continue_at(cl, cache_lookup, bcache_wq);
721 739
@@ -756,13 +774,15 @@ static void bio_complete(struct search *s)
756 } 774 }
757} 775}
758 776
759static void do_bio_hook(struct search *s) 777static void do_bio_hook(struct search *s, struct bio *orig_bio)
760{ 778{
761 struct bio *bio = &s->bio.bio; 779 struct bio *bio = &s->bio.bio;
762 memcpy(bio, s->orig_bio, sizeof(struct bio));
763 780
781 bio_init(bio);
782 __bio_clone_fast(bio, orig_bio);
764 bio->bi_end_io = request_endio; 783 bio->bi_end_io = request_endio;
765 bio->bi_private = &s->cl; 784 bio->bi_private = &s->cl;
785
766 atomic_set(&bio->bi_cnt, 3); 786 atomic_set(&bio->bi_cnt, 3);
767} 787}
768 788
@@ -774,43 +794,36 @@ static void search_free(struct closure *cl)
774 if (s->iop.bio) 794 if (s->iop.bio)
775 bio_put(s->iop.bio); 795 bio_put(s->iop.bio);
776 796
777 if (s->unaligned_bvec)
778 mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec);
779
780 closure_debug_destroy(cl); 797 closure_debug_destroy(cl);
781 mempool_free(s, s->d->c->search); 798 mempool_free(s, s->d->c->search);
782} 799}
783 800
784static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 801static inline struct search *search_alloc(struct bio *bio,
802 struct bcache_device *d)
785{ 803{
786 struct search *s; 804 struct search *s;
787 struct bio_vec *bv;
788 805
789 s = mempool_alloc(d->c->search, GFP_NOIO); 806 s = mempool_alloc(d->c->search, GFP_NOIO);
790 memset(s, 0, offsetof(struct search, iop.insert_keys));
791 807
792 __closure_init(&s->cl, NULL); 808 closure_init(&s->cl, NULL);
809 do_bio_hook(s, bio);
793 810
794 s->iop.inode = d->id;
795 s->iop.c = d->c;
796 s->d = d;
797 s->op.lock = -1;
798 s->iop.write_point = hash_long((unsigned long) current, 16);
799 s->orig_bio = bio; 811 s->orig_bio = bio;
800 s->write = (bio->bi_rw & REQ_WRITE) != 0; 812 s->cache_miss = NULL;
801 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 813 s->d = d;
802 s->recoverable = 1; 814 s->recoverable = 1;
815 s->write = (bio->bi_rw & REQ_WRITE) != 0;
816 s->read_dirty_data = 0;
803 s->start_time = jiffies; 817 s->start_time = jiffies;
804 do_bio_hook(s);
805 818
806 if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { 819 s->iop.c = d->c;
807 bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); 820 s->iop.bio = NULL;
808 memcpy(bv, bio_iovec(bio), 821 s->iop.inode = d->id;
809 sizeof(struct bio_vec) * bio_segments(bio)); 822 s->iop.write_point = hash_long((unsigned long) current, 16);
810 823 s->iop.write_prio = 0;
811 s->bio.bio.bi_io_vec = bv; 824 s->iop.error = 0;
812 s->unaligned_bvec = 1; 825 s->iop.flags = 0;
813 } 826 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
814 827
815 return s; 828 return s;
816} 829}
@@ -850,26 +863,13 @@ static void cached_dev_read_error(struct closure *cl)
850{ 863{
851 struct search *s = container_of(cl, struct search, cl); 864 struct search *s = container_of(cl, struct search, cl);
852 struct bio *bio = &s->bio.bio; 865 struct bio *bio = &s->bio.bio;
853 struct bio_vec *bv;
854 int i;
855 866
856 if (s->recoverable) { 867 if (s->recoverable) {
857 /* Retry from the backing device: */ 868 /* Retry from the backing device: */
858 trace_bcache_read_retry(s->orig_bio); 869 trace_bcache_read_retry(s->orig_bio);
859 870
860 s->iop.error = 0; 871 s->iop.error = 0;
861 bv = s->bio.bio.bi_io_vec; 872 do_bio_hook(s, s->orig_bio);
862 do_bio_hook(s);
863 s->bio.bio.bi_io_vec = bv;
864
865 if (!s->unaligned_bvec)
866 bio_for_each_segment(bv, s->orig_bio, i)
867 bv->bv_offset = 0, bv->bv_len = PAGE_SIZE;
868 else
869 memcpy(s->bio.bio.bi_io_vec,
870 bio_iovec(s->orig_bio),
871 sizeof(struct bio_vec) *
872 bio_segments(s->orig_bio));
873 873
874 /* XXX: invalidate cache */ 874 /* XXX: invalidate cache */
875 875
@@ -894,9 +894,9 @@ static void cached_dev_read_done(struct closure *cl)
894 894
895 if (s->iop.bio) { 895 if (s->iop.bio) {
896 bio_reset(s->iop.bio); 896 bio_reset(s->iop.bio);
897 s->iop.bio->bi_sector = s->cache_miss->bi_sector; 897 s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector;
898 s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; 898 s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
899 s->iop.bio->bi_size = s->insert_bio_sectors << 9; 899 s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
900 bch_bio_map(s->iop.bio, NULL); 900 bch_bio_map(s->iop.bio, NULL);
901 901
902 bio_copy_data(s->cache_miss, s->iop.bio); 902 bio_copy_data(s->cache_miss, s->iop.bio);
@@ -905,8 +905,7 @@ static void cached_dev_read_done(struct closure *cl)
905 s->cache_miss = NULL; 905 s->cache_miss = NULL;
906 } 906 }
907 907
908 if (verify(dc, &s->bio.bio) && s->recoverable && 908 if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
909 !s->unaligned_bvec && !s->read_dirty_data)
910 bch_data_verify(dc, s->orig_bio); 909 bch_data_verify(dc, s->orig_bio);
911 910
912 bio_complete(s); 911 bio_complete(s);
@@ -946,7 +945,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
946 struct bio *miss, *cache_bio; 945 struct bio *miss, *cache_bio;
947 946
948 if (s->cache_miss || s->iop.bypass) { 947 if (s->cache_miss || s->iop.bypass) {
949 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 948 miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
950 ret = miss == bio ? MAP_DONE : MAP_CONTINUE; 949 ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
951 goto out_submit; 950 goto out_submit;
952 } 951 }
@@ -960,7 +959,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
960 s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); 959 s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);
961 960
962 s->iop.replace_key = KEY(s->iop.inode, 961 s->iop.replace_key = KEY(s->iop.inode,
963 bio->bi_sector + s->insert_bio_sectors, 962 bio->bi_iter.bi_sector + s->insert_bio_sectors,
964 s->insert_bio_sectors); 963 s->insert_bio_sectors);
965 964
966 ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); 965 ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
@@ -969,7 +968,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
969 968
970 s->iop.replace = true; 969 s->iop.replace = true;
971 970
972 miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 971 miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
973 972
974 /* btree_search_recurse()'s btree iterator is no good anymore */ 973 /* btree_search_recurse()'s btree iterator is no good anymore */
975 ret = miss == bio ? MAP_DONE : -EINTR; 974 ret = miss == bio ? MAP_DONE : -EINTR;
@@ -980,9 +979,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
980 if (!cache_bio) 979 if (!cache_bio)
981 goto out_submit; 980 goto out_submit;
982 981
983 cache_bio->bi_sector = miss->bi_sector; 982 cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector;
984 cache_bio->bi_bdev = miss->bi_bdev; 983 cache_bio->bi_bdev = miss->bi_bdev;
985 cache_bio->bi_size = s->insert_bio_sectors << 9; 984 cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
986 985
987 cache_bio->bi_end_io = request_endio; 986 cache_bio->bi_end_io = request_endio;
988 cache_bio->bi_private = &s->cl; 987 cache_bio->bi_private = &s->cl;
@@ -1032,7 +1031,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
1032{ 1031{
1033 struct closure *cl = &s->cl; 1032 struct closure *cl = &s->cl;
1034 struct bio *bio = &s->bio.bio; 1033 struct bio *bio = &s->bio.bio;
1035 struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); 1034 struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
1036 struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); 1035 struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);
1037 1036
1038 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); 1037 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);
@@ -1088,8 +1087,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
1088 closure_bio_submit(flush, cl, s->d); 1087 closure_bio_submit(flush, cl, s->d);
1089 } 1088 }
1090 } else { 1089 } else {
1091 s->iop.bio = bio_clone_bioset(bio, GFP_NOIO, 1090 s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
1092 dc->disk.bio_split);
1093 1091
1094 closure_bio_submit(bio, cl, s->d); 1092 closure_bio_submit(bio, cl, s->d);
1095 } 1093 }
@@ -1127,13 +1125,13 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
1127 part_stat_unlock(); 1125 part_stat_unlock();
1128 1126
1129 bio->bi_bdev = dc->bdev; 1127 bio->bi_bdev = dc->bdev;
1130 bio->bi_sector += dc->sb.data_offset; 1128 bio->bi_iter.bi_sector += dc->sb.data_offset;
1131 1129
1132 if (cached_dev_get(dc)) { 1130 if (cached_dev_get(dc)) {
1133 s = search_alloc(bio, d); 1131 s = search_alloc(bio, d);
1134 trace_bcache_request_start(s->d, bio); 1132 trace_bcache_request_start(s->d, bio);
1135 1133
1136 if (!bio->bi_size) { 1134 if (!bio->bi_iter.bi_size) {
1137 /* 1135 /*
1138 * can't call bch_journal_meta from under 1136 * can't call bch_journal_meta from under
1139 * generic_make_request 1137 * generic_make_request
@@ -1205,24 +1203,24 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
1205static int flash_dev_cache_miss(struct btree *b, struct search *s, 1203static int flash_dev_cache_miss(struct btree *b, struct search *s,
1206 struct bio *bio, unsigned sectors) 1204 struct bio *bio, unsigned sectors)
1207{ 1205{
1208 struct bio_vec *bv; 1206 struct bio_vec bv;
1209 int i; 1207 struct bvec_iter iter;
1210 1208
1211 /* Zero fill bio */ 1209 /* Zero fill bio */
1212 1210
1213 bio_for_each_segment(bv, bio, i) { 1211 bio_for_each_segment(bv, bio, iter) {
1214 unsigned j = min(bv->bv_len >> 9, sectors); 1212 unsigned j = min(bv.bv_len >> 9, sectors);
1215 1213
1216 void *p = kmap(bv->bv_page); 1214 void *p = kmap(bv.bv_page);
1217 memset(p + bv->bv_offset, 0, j << 9); 1215 memset(p + bv.bv_offset, 0, j << 9);
1218 kunmap(bv->bv_page); 1216 kunmap(bv.bv_page);
1219 1217
1220 sectors -= j; 1218 sectors -= j;
1221 } 1219 }
1222 1220
1223 bio_advance(bio, min(sectors << 9, bio->bi_size)); 1221 bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size));
1224 1222
1225 if (!bio->bi_size) 1223 if (!bio->bi_iter.bi_size)
1226 return MAP_DONE; 1224 return MAP_DONE;
1227 1225
1228 return MAP_CONTINUE; 1226 return MAP_CONTINUE;
@@ -1256,7 +1254,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1256 1254
1257 trace_bcache_request_start(s->d, bio); 1255 trace_bcache_request_start(s->d, bio);
1258 1256
1259 if (!bio->bi_size) { 1257 if (!bio->bi_iter.bi_size) {
1260 /* 1258 /*
1261 * can't call bch_journal_meta from under 1259 * can't call bch_journal_meta from under
1262 * generic_make_request 1260 * generic_make_request
@@ -1266,7 +1264,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
1266 bcache_wq); 1264 bcache_wq);
1267 } else if (rw) { 1265 } else if (rw) {
1268 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, 1266 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
1269 &KEY(d->id, bio->bi_sector, 0), 1267 &KEY(d->id, bio->bi_iter.bi_sector, 0),
1270 &KEY(d->id, bio_end_sector(bio), 0)); 1268 &KEY(d->id, bio_end_sector(bio), 0));
1271 1269
1272 s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; 1270 s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0;
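
All of the request.c hunks above are part of the same conversion to immutable biovecs: the bio's position and remaining size move from bio->bi_sector / bio->bi_size into bio->bi_iter, per-segment walks use a struct bvec_iter instead of indexing bi_io_vec through bi_idx, and the split/clone calls switch to bio_next_split() and bio_clone_fast(), which share the parent's biovec rather than copying it. The new iteration idiom, condensed from flash_dev_cache_miss() above (a kernel-style sketch, not a standalone program):

	struct bio_vec bv;	/* value copy of the current segment */
	struct bvec_iter iter;	/* tracks position within the bio */

	bio_for_each_segment(bv, bio, iter) {
		void *p = kmap(bv.bv_page);

		memset(p + bv.bv_offset, 0, bv.bv_len);
		kunmap(bv.bv_page);
	}

	/* start sector and byte count now live under bi_iter */
	pr_debug("sector %llu, %u bytes",
		 (unsigned long long) bio->bi_iter.bi_sector,
		 bio->bi_iter.bi_size);
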
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 2cd65bf073c2..39f21dbedc38 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -13,17 +13,22 @@ struct data_insert_op {
13 uint16_t write_prio; 13 uint16_t write_prio;
14 short error; 14 short error;
15 15
16 unsigned bypass:1; 16 union {
17 unsigned writeback:1; 17 uint16_t flags;
18 unsigned flush_journal:1;
19 unsigned csum:1;
20 18
21 unsigned replace:1; 19 struct {
22 unsigned replace_collision:1; 20 unsigned bypass:1;
21 unsigned writeback:1;
22 unsigned flush_journal:1;
23 unsigned csum:1;
23 24
24 unsigned insert_data_done:1; 25 unsigned replace:1;
26 unsigned replace_collision:1;
27
28 unsigned insert_data_done:1;
29 };
30 };
25 31
26 /* Anything past this point won't get zeroed in search_alloc() */
27 struct keylist insert_keys; 32 struct keylist insert_keys;
28 BKEY_PADDED(replace_key); 33 BKEY_PADDED(replace_key);
29}; 34};
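
Overlaying the mode bits with a single uint16_t is what lets search_alloc() above reset every flag with one assignment (s->iop.flags = 0) instead of the old partial memset up to insert_keys. A minimal standalone C illustration of the same idiom (the struct and field names here are made up for the example; bit-field layout is implementation defined, but writing flags = 0 zeroes every member regardless of layout):

#include <stdint.h>
#include <stdio.h>

struct op_flags {
	union {
		uint16_t flags;			/* whole-word view */
		struct {			/* per-bit view */
			unsigned bypass:1;
			unsigned writeback:1;
			unsigned flush_journal:1;
		};
	};
};

int main(void)
{
	struct op_flags op;

	op.flags = 0;		/* clears every bit-field in one store */
	op.writeback = 1;
	printf("flags = %#x\n", (unsigned) op.flags);
	return 0;
}
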
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c57bfa071a57..24a3a1546caa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -9,6 +9,7 @@
9#include "bcache.h" 9#include "bcache.h"
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "extents.h"
12#include "request.h" 13#include "request.h"
13#include "writeback.h" 14#include "writeback.h"
14 15
@@ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error)
225 struct cached_dev *dc = bio->bi_private; 226 struct cached_dev *dc = bio->bi_private;
226 /* XXX: error checking */ 227 /* XXX: error checking */
227 228
228 closure_put(&dc->sb_write.cl); 229 closure_put(&dc->sb_write);
229} 230}
230 231
231static void __write_super(struct cache_sb *sb, struct bio *bio) 232static void __write_super(struct cache_sb *sb, struct bio *bio)
@@ -233,9 +234,9 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
233 struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); 234 struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
234 unsigned i; 235 unsigned i;
235 236
236 bio->bi_sector = SB_SECTOR; 237 bio->bi_iter.bi_sector = SB_SECTOR;
237 bio->bi_rw = REQ_SYNC|REQ_META; 238 bio->bi_rw = REQ_SYNC|REQ_META;
238 bio->bi_size = SB_SIZE; 239 bio->bi_iter.bi_size = SB_SIZE;
239 bch_bio_map(bio, NULL); 240 bch_bio_map(bio, NULL);
240 241
241 out->offset = cpu_to_le64(sb->offset); 242 out->offset = cpu_to_le64(sb->offset);
@@ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
263 submit_bio(REQ_WRITE, bio); 264 submit_bio(REQ_WRITE, bio);
264} 265}
265 266
267static void bch_write_bdev_super_unlock(struct closure *cl)
268{
269 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
270
271 up(&dc->sb_write_mutex);
272}
273
266void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) 274void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
267{ 275{
268 struct closure *cl = &dc->sb_write.cl; 276 struct closure *cl = &dc->sb_write;
269 struct bio *bio = &dc->sb_bio; 277 struct bio *bio = &dc->sb_bio;
270 278
271 closure_lock(&dc->sb_write, parent); 279 down(&dc->sb_write_mutex);
280 closure_init(cl, parent);
272 281
273 bio_reset(bio); 282 bio_reset(bio);
274 bio->bi_bdev = dc->bdev; 283 bio->bi_bdev = dc->bdev;
@@ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
278 closure_get(cl); 287 closure_get(cl);
279 __write_super(&dc->sb, bio); 288 __write_super(&dc->sb, bio);
280 289
281 closure_return(cl); 290 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
282} 291}
283 292
284static void write_super_endio(struct bio *bio, int error) 293static void write_super_endio(struct bio *bio, int error)
@@ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error)
286 struct cache *ca = bio->bi_private; 295 struct cache *ca = bio->bi_private;
287 296
288 bch_count_io_errors(ca, error, "writing superblock"); 297 bch_count_io_errors(ca, error, "writing superblock");
289 closure_put(&ca->set->sb_write.cl); 298 closure_put(&ca->set->sb_write);
299}
300
301static void bcache_write_super_unlock(struct closure *cl)
302{
303 struct cache_set *c = container_of(cl, struct cache_set, sb_write);
304
305 up(&c->sb_write_mutex);
290} 306}
291 307
292void bcache_write_super(struct cache_set *c) 308void bcache_write_super(struct cache_set *c)
293{ 309{
294 struct closure *cl = &c->sb_write.cl; 310 struct closure *cl = &c->sb_write;
295 struct cache *ca; 311 struct cache *ca;
296 unsigned i; 312 unsigned i;
297 313
298 closure_lock(&c->sb_write, &c->cl); 314 down(&c->sb_write_mutex);
315 closure_init(cl, &c->cl);
299 316
300 c->sb.seq++; 317 c->sb.seq++;
301 318
@@ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c)
317 __write_super(&ca->sb, bio); 334 __write_super(&ca->sb, bio);
318 } 335 }
319 336
320 closure_return(cl); 337 closure_return_with_destructor(cl, bcache_write_super_unlock);
321} 338}
322 339
323/* UUID io */ 340/* UUID io */
@@ -325,29 +342,37 @@ void bcache_write_super(struct cache_set *c)
325static void uuid_endio(struct bio *bio, int error) 342static void uuid_endio(struct bio *bio, int error)
326{ 343{
327 struct closure *cl = bio->bi_private; 344 struct closure *cl = bio->bi_private;
328 struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); 345 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
329 346
330 cache_set_err_on(error, c, "accessing uuids"); 347 cache_set_err_on(error, c, "accessing uuids");
331 bch_bbio_free(bio, c); 348 bch_bbio_free(bio, c);
332 closure_put(cl); 349 closure_put(cl);
333} 350}
334 351
352static void uuid_io_unlock(struct closure *cl)
353{
354 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
355
356 up(&c->uuid_write_mutex);
357}
358
335static void uuid_io(struct cache_set *c, unsigned long rw, 359static void uuid_io(struct cache_set *c, unsigned long rw,
336 struct bkey *k, struct closure *parent) 360 struct bkey *k, struct closure *parent)
337{ 361{
338 struct closure *cl = &c->uuid_write.cl; 362 struct closure *cl = &c->uuid_write;
339 struct uuid_entry *u; 363 struct uuid_entry *u;
340 unsigned i; 364 unsigned i;
341 char buf[80]; 365 char buf[80];
342 366
343 BUG_ON(!parent); 367 BUG_ON(!parent);
344 closure_lock(&c->uuid_write, parent); 368 down(&c->uuid_write_mutex);
369 closure_init(cl, parent);
345 370
346 for (i = 0; i < KEY_PTRS(k); i++) { 371 for (i = 0; i < KEY_PTRS(k); i++) {
347 struct bio *bio = bch_bbio_alloc(c); 372 struct bio *bio = bch_bbio_alloc(c);
348 373
349 bio->bi_rw = REQ_SYNC|REQ_META|rw; 374 bio->bi_rw = REQ_SYNC|REQ_META|rw;
350 bio->bi_size = KEY_SIZE(k) << 9; 375 bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
351 376
352 bio->bi_end_io = uuid_endio; 377 bio->bi_end_io = uuid_endio;
353 bio->bi_private = cl; 378 bio->bi_private = cl;
@@ -359,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
359 break; 384 break;
360 } 385 }
361 386
362 bch_bkey_to_text(buf, sizeof(buf), k); 387 bch_extent_to_text(buf, sizeof(buf), k);
363 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); 388 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
364 389
365 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) 390 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
@@ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
368 u - c->uuids, u->uuid, u->label, 393 u - c->uuids, u->uuid, u->label,
369 u->first_reg, u->last_reg, u->invalidated); 394 u->first_reg, u->last_reg, u->invalidated);
370 395
371 closure_return(cl); 396 closure_return_with_destructor(cl, uuid_io_unlock);
372} 397}
373 398
374static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) 399static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
375{ 400{
376 struct bkey *k = &j->uuid_bucket; 401 struct bkey *k = &j->uuid_bucket;
377 402
378 if (bch_btree_ptr_invalid(c, k)) 403 if (__bch_btree_ptr_invalid(c, k))
379 return "bad uuid pointer"; 404 return "bad uuid pointer";
380 405
381 bkey_copy(&c->uuid_bucket, k); 406 bkey_copy(&c->uuid_bucket, k);
@@ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c)
420 445
421 lockdep_assert_held(&bch_register_lock); 446 lockdep_assert_held(&bch_register_lock);
422 447
423 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) 448 if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
424 return 1; 449 return 1;
425 450
426 SET_KEY_SIZE(&k.key, c->sb.bucket_size); 451 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
@@ -503,10 +528,10 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
503 528
504 closure_init_stack(cl); 529 closure_init_stack(cl);
505 530
506 bio->bi_sector = bucket * ca->sb.bucket_size; 531 bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
507 bio->bi_bdev = ca->bdev; 532 bio->bi_bdev = ca->bdev;
508 bio->bi_rw = REQ_SYNC|REQ_META|rw; 533 bio->bi_rw = REQ_SYNC|REQ_META|rw;
509 bio->bi_size = bucket_bytes(ca); 534 bio->bi_iter.bi_size = bucket_bytes(ca);
510 535
511 bio->bi_end_io = prio_endio; 536 bio->bi_end_io = prio_endio;
512 bio->bi_private = ca; 537 bio->bi_private = ca;
@@ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca)
538 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), 563 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
539 &ca->meta_sectors_written); 564 &ca->meta_sectors_written);
540 565
541 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), 566 //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
542 fifo_used(&ca->free_inc), fifo_used(&ca->unused)); 567 // fifo_used(&ca->free_inc), fifo_used(&ca->unused));
543 568
544 for (i = prio_buckets(ca) - 1; i >= 0; --i) { 569 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
545 long bucket; 570 long bucket;
@@ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca)
558 p->magic = pset_magic(&ca->sb); 583 p->magic = pset_magic(&ca->sb);
559 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); 584 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
560 585
561 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); 586 bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
562 BUG_ON(bucket == -1); 587 BUG_ON(bucket == -1);
563 588
564 mutex_unlock(&ca->set->bucket_lock); 589 mutex_unlock(&ca->set->bucket_lock);
@@ -739,8 +764,6 @@ static void bcache_device_free(struct bcache_device *d)
739 } 764 }
740 765
741 bio_split_pool_free(&d->bio_split_hook); 766 bio_split_pool_free(&d->bio_split_hook);
742 if (d->unaligned_bvec)
743 mempool_destroy(d->unaligned_bvec);
744 if (d->bio_split) 767 if (d->bio_split)
745 bioset_free(d->bio_split); 768 bioset_free(d->bio_split);
746 if (is_vmalloc_addr(d->full_dirty_stripes)) 769 if (is_vmalloc_addr(d->full_dirty_stripes))
@@ -793,8 +816,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
793 return minor; 816 return minor;
794 817
795 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 818 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
796 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
797 sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
798 bio_split_pool_init(&d->bio_split_hook) || 819 bio_split_pool_init(&d->bio_split_hook) ||
799 !(d->disk = alloc_disk(1))) { 820 !(d->disk = alloc_disk(1))) {
800 ida_simple_remove(&bcache_minor, minor); 821 ida_simple_remove(&bcache_minor, minor);
@@ -1102,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1102 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); 1123 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1103 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); 1124 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1104 INIT_WORK(&dc->detach, cached_dev_detach_finish); 1125 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1105 closure_init_unlocked(&dc->sb_write); 1126 sema_init(&dc->sb_write_mutex, 1);
1106 INIT_LIST_HEAD(&dc->io_lru); 1127 INIT_LIST_HEAD(&dc->io_lru);
1107 spin_lock_init(&dc->io_lock); 1128 spin_lock_init(&dc->io_lock);
1108 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); 1129 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
@@ -1114,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1114 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); 1135 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1115 } 1136 }
1116 1137
1138 dc->disk.stripe_size = q->limits.io_opt >> 9;
1139
1140 if (dc->disk.stripe_size)
1141 dc->partial_stripes_expensive =
1142 q->limits.raid_partial_stripes_expensive;
1143
1117 ret = bcache_device_init(&dc->disk, block_size, 1144 ret = bcache_device_init(&dc->disk, block_size,
1118 dc->bdev->bd_part->nr_sects - dc->sb.data_offset); 1145 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1119 if (ret) 1146 if (ret)
@@ -1325,8 +1352,8 @@ static void cache_set_free(struct closure *cl)
1325 if (ca) 1352 if (ca)
1326 kobject_put(&ca->kobj); 1353 kobject_put(&ca->kobj);
1327 1354
1355 bch_bset_sort_state_free(&c->sort);
1328 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); 1356 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1329 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
1330 1357
1331 if (c->bio_split) 1358 if (c->bio_split)
1332 bioset_free(c->bio_split); 1359 bioset_free(c->bio_split);
@@ -1451,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1451 c->block_bits = ilog2(sb->block_size); 1478 c->block_bits = ilog2(sb->block_size);
1452 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); 1479 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1453 1480
1454 c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; 1481 c->btree_pages = bucket_pages(c);
1455 if (c->btree_pages > BTREE_MAX_PAGES) 1482 if (c->btree_pages > BTREE_MAX_PAGES)
1456 c->btree_pages = max_t(int, c->btree_pages / 4, 1483 c->btree_pages = max_t(int, c->btree_pages / 4,
1457 BTREE_MAX_PAGES); 1484 BTREE_MAX_PAGES);
1458 1485
1459 c->sort_crit_factor = int_sqrt(c->btree_pages); 1486 sema_init(&c->sb_write_mutex, 1);
1460
1461 closure_init_unlocked(&c->sb_write);
1462 mutex_init(&c->bucket_lock); 1487 mutex_init(&c->bucket_lock);
1463 init_waitqueue_head(&c->try_wait); 1488 init_waitqueue_head(&c->try_wait);
1464 init_waitqueue_head(&c->bucket_wait); 1489 init_waitqueue_head(&c->bucket_wait);
1465 closure_init_unlocked(&c->uuid_write); 1490 sema_init(&c->uuid_write_mutex, 1);
1466 mutex_init(&c->sort_lock);
1467 1491
1468 spin_lock_init(&c->sort_time.lock);
1469 spin_lock_init(&c->btree_gc_time.lock); 1492 spin_lock_init(&c->btree_gc_time.lock);
1470 spin_lock_init(&c->btree_split_time.lock); 1493 spin_lock_init(&c->btree_split_time.lock);
1471 spin_lock_init(&c->btree_read_time.lock); 1494 spin_lock_init(&c->btree_read_time.lock);
@@ -1493,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1493 bucket_pages(c))) || 1516 bucket_pages(c))) ||
1494 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || 1517 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
1495 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 1518 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1496 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
1497 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || 1519 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1498 bch_journal_alloc(c) || 1520 bch_journal_alloc(c) ||
1499 bch_btree_cache_alloc(c) || 1521 bch_btree_cache_alloc(c) ||
1500 bch_open_buckets_alloc(c)) 1522 bch_open_buckets_alloc(c) ||
1523 bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1501 goto err; 1524 goto err;
1502 1525
1503 c->congested_read_threshold_us = 2000; 1526 c->congested_read_threshold_us = 2000;
@@ -1553,7 +1576,7 @@ static void run_cache_set(struct cache_set *c)
1553 k = &j->btree_root; 1576 k = &j->btree_root;
1554 1577
1555 err = "bad btree root"; 1578 err = "bad btree root";
1556 if (bch_btree_ptr_invalid(c, k)) 1579 if (__bch_btree_ptr_invalid(c, k))
1557 goto err; 1580 goto err;
1558 1581
1559 err = "error reading btree root"; 1582 err = "error reading btree root";
@@ -1747,6 +1770,7 @@ err:
1747void bch_cache_release(struct kobject *kobj) 1770void bch_cache_release(struct kobject *kobj)
1748{ 1771{
1749 struct cache *ca = container_of(kobj, struct cache, kobj); 1772 struct cache *ca = container_of(kobj, struct cache, kobj);
1773 unsigned i;
1750 1774
1751 if (ca->set) 1775 if (ca->set)
1752 ca->set->cache[ca->sb.nr_this_dev] = NULL; 1776 ca->set->cache[ca->sb.nr_this_dev] = NULL;
@@ -1760,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj)
1760 free_heap(&ca->heap); 1784 free_heap(&ca->heap);
1761 free_fifo(&ca->unused); 1785 free_fifo(&ca->unused);
1762 free_fifo(&ca->free_inc); 1786 free_fifo(&ca->free_inc);
1763 free_fifo(&ca->free); 1787
1788 for (i = 0; i < RESERVE_NR; i++)
1789 free_fifo(&ca->free[i]);
1764 1790
1765 if (ca->sb_bio.bi_inline_vecs[0].bv_page) 1791 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1766 put_page(ca->sb_bio.bi_io_vec[0].bv_page); 1792 put_page(ca->sb_bio.bi_io_vec[0].bv_page);
@@ -1786,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1786 ca->journal.bio.bi_max_vecs = 8; 1812 ca->journal.bio.bi_max_vecs = 8;
1787 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; 1813 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
1788 1814
1789 free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; 1815 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
1790 free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
1791 1816
1792 if (!init_fifo(&ca->free, free, GFP_KERNEL) || 1817 if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
1818 !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1819 !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
1820 !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
1793 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || 1821 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
1794 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || 1822 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
1795 !init_heap(&ca->heap, free << 3, GFP_KERNEL) || 1823 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
@@ -2034,7 +2062,8 @@ static void bcache_exit(void)
2034 kobject_put(bcache_kobj); 2062 kobject_put(bcache_kobj);
2035 if (bcache_wq) 2063 if (bcache_wq)
2036 destroy_workqueue(bcache_wq); 2064 destroy_workqueue(bcache_wq);
2037 unregister_blkdev(bcache_major, "bcache"); 2065 if (bcache_major)
2066 unregister_blkdev(bcache_major, "bcache");
2038 unregister_reboot_notifier(&reboot); 2067 unregister_reboot_notifier(&reboot);
2039} 2068}
2040 2069
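
The superblock and UUID paths above drop the old closure_lock()/closure_init_unlocked() helpers in favour of a plain semaphore guarding a bare closure, with the unlock pushed into a closure destructor so it only happens once the I/O has completed. The shape of the pattern, condensed from bch_write_bdev_super() and uuid_io() (kernel-style sketch; the sketch_* names are illustrative, the fields are the ones introduced above):

static void sketch_sb_write_unlock(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

	up(&dc->sb_write_mutex);		/* next writer may proceed */
}

static void sketch_sb_write(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write;

	down(&dc->sb_write_mutex);		/* one superblock write at a time */
	closure_init(cl, parent);

	/* ... fill in and submit the bio, taking closure refs per submission ... */

	closure_return_with_destructor(cl, sketch_sb_write_unlock);
}
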
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index a1f85612f0b3..d8458d477a12 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test);
102rw_attribute(key_merging_disabled); 102rw_attribute(key_merging_disabled);
103rw_attribute(gc_always_rewrite); 103rw_attribute(gc_always_rewrite);
104rw_attribute(expensive_debug_checks); 104rw_attribute(expensive_debug_checks);
105rw_attribute(freelist_percent);
106rw_attribute(cache_replacement_policy); 105rw_attribute(cache_replacement_policy);
107rw_attribute(btree_shrinker_disabled); 106rw_attribute(btree_shrinker_disabled);
108rw_attribute(copy_gc_enabled); 107rw_attribute(copy_gc_enabled);
@@ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = {
401}; 400};
402KTYPE(bch_flash_dev); 401KTYPE(bch_flash_dev);
403 402
403struct bset_stats_op {
404 struct btree_op op;
405 size_t nodes;
406 struct bset_stats stats;
407};
408
409static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
410{
411 struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
412
413 op->nodes++;
414 bch_btree_keys_stats(&b->keys, &op->stats);
415
416 return MAP_CONTINUE;
417}
418
419static int bch_bset_print_stats(struct cache_set *c, char *buf)
420{
421 struct bset_stats_op op;
422 int ret;
423
424 memset(&op, 0, sizeof(op));
425 bch_btree_op_init(&op.op, -1);
426
427 ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats);
428 if (ret < 0)
429 return ret;
430
431 return snprintf(buf, PAGE_SIZE,
432 "btree nodes: %zu\n"
433 "written sets: %zu\n"
434 "unwritten sets: %zu\n"
435 "written key bytes: %zu\n"
436 "unwritten key bytes: %zu\n"
437 "floats: %zu\n"
438 "failed: %zu\n",
439 op.nodes,
440 op.stats.sets_written, op.stats.sets_unwritten,
441 op.stats.bytes_written, op.stats.bytes_unwritten,
442 op.stats.floats, op.stats.failed);
443}
444
404SHOW(__bch_cache_set) 445SHOW(__bch_cache_set)
405{ 446{
406 unsigned root_usage(struct cache_set *c) 447 unsigned root_usage(struct cache_set *c)
@@ -419,7 +460,7 @@ lock_root:
419 rw_lock(false, b, b->level); 460 rw_lock(false, b, b->level);
420 } while (b != c->root); 461 } while (b != c->root);
421 462
422 for_each_key_filter(b, k, &iter, bch_ptr_bad) 463 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
423 bytes += bkey_bytes(k); 464 bytes += bkey_bytes(k);
424 465
425 rw_unlock(false, b); 466 rw_unlock(false, b);
@@ -434,7 +475,7 @@ lock_root:
434 475
435 mutex_lock(&c->bucket_lock); 476 mutex_lock(&c->bucket_lock);
436 list_for_each_entry(b, &c->btree_cache, list) 477 list_for_each_entry(b, &c->btree_cache, list)
437 ret += 1 << (b->page_order + PAGE_SHIFT); 478 ret += 1 << (b->keys.page_order + PAGE_SHIFT);
438 479
439 mutex_unlock(&c->bucket_lock); 480 mutex_unlock(&c->bucket_lock);
440 return ret; 481 return ret;
@@ -491,7 +532,7 @@ lock_root:
491 532
492 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); 533 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
493 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); 534 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
494 sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); 535 sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
495 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); 536 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
496 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); 537 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
497 538
@@ -711,9 +752,6 @@ SHOW(__bch_cache)
711 sysfs_print(io_errors, 752 sysfs_print(io_errors,
712 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); 753 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
713 754
714 sysfs_print(freelist_percent, ca->free.size * 100 /
715 ((size_t) ca->sb.nbuckets));
716
717 if (attr == &sysfs_cache_replacement_policy) 755 if (attr == &sysfs_cache_replacement_policy)
718 return bch_snprint_string_list(buf, PAGE_SIZE, 756 return bch_snprint_string_list(buf, PAGE_SIZE,
719 cache_replacement_policies, 757 cache_replacement_policies,
@@ -820,32 +858,6 @@ STORE(__bch_cache)
820 } 858 }
821 } 859 }
822 860
823 if (attr == &sysfs_freelist_percent) {
824 DECLARE_FIFO(long, free);
825 long i;
826 size_t p = strtoul_or_return(buf);
827
828 p = clamp_t(size_t,
829 ((size_t) ca->sb.nbuckets * p) / 100,
830 roundup_pow_of_two(ca->sb.nbuckets) >> 9,
831 ca->sb.nbuckets / 2);
832
833 if (!init_fifo_exact(&free, p, GFP_KERNEL))
834 return -ENOMEM;
835
836 mutex_lock(&ca->set->bucket_lock);
837
838 fifo_move(&free, &ca->free);
839 fifo_swap(&free, &ca->free);
840
841 mutex_unlock(&ca->set->bucket_lock);
842
843 while (fifo_pop(&free, i))
844 atomic_dec(&ca->buckets[i].pin);
845
846 free_fifo(&free);
847 }
848
849 if (attr == &sysfs_clear_stats) { 861 if (attr == &sysfs_clear_stats) {
850 atomic_long_set(&ca->sectors_written, 0); 862 atomic_long_set(&ca->sectors_written, 0);
851 atomic_long_set(&ca->btree_sectors_written, 0); 863 atomic_long_set(&ca->btree_sectors_written, 0);
@@ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = {
869 &sysfs_metadata_written, 881 &sysfs_metadata_written,
870 &sysfs_io_errors, 882 &sysfs_io_errors,
871 &sysfs_clear_stats, 883 &sysfs_clear_stats,
872 &sysfs_freelist_percent,
873 &sysfs_cache_replacement_policy, 884 &sysfs_cache_replacement_policy,
874 NULL 885 NULL
875}; 886};
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index bb37618e7664..db3ae4c2b223 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -224,10 +224,10 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
224 224
225void bch_bio_map(struct bio *bio, void *base) 225void bch_bio_map(struct bio *bio, void *base)
226{ 226{
227 size_t size = bio->bi_size; 227 size_t size = bio->bi_iter.bi_size;
228 struct bio_vec *bv = bio->bi_io_vec; 228 struct bio_vec *bv = bio->bi_io_vec;
229 229
230 BUG_ON(!bio->bi_size); 230 BUG_ON(!bio->bi_iter.bi_size);
231 BUG_ON(bio->bi_vcnt); 231 BUG_ON(bio->bi_vcnt);
232 232
233 bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; 233 bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1030c6020e98..ac7d0d1f70d7 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -2,6 +2,7 @@
2#ifndef _BCACHE_UTIL_H 2#ifndef _BCACHE_UTIL_H
3#define _BCACHE_UTIL_H 3#define _BCACHE_UTIL_H
4 4
5#include <linux/blkdev.h>
5#include <linux/errno.h> 6#include <linux/errno.h>
6#include <linux/kernel.h> 7#include <linux/kernel.h>
7#include <linux/llist.h> 8#include <linux/llist.h>
@@ -17,11 +18,13 @@ struct closure;
17 18
18#ifdef CONFIG_BCACHE_DEBUG 19#ifdef CONFIG_BCACHE_DEBUG
19 20
21#define EBUG_ON(cond) BUG_ON(cond)
20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 22#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
21#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) 23#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
22 24
23#else /* DEBUG */ 25#else /* DEBUG */
24 26
27#define EBUG_ON(cond) do { if (cond); } while (0)
25#define atomic_dec_bug(v) atomic_dec(v) 28#define atomic_dec_bug(v) atomic_dec(v)
26#define atomic_inc_bug(v, i) atomic_inc(v) 29#define atomic_inc_bug(v, i) atomic_inc(v)
27 30
@@ -391,6 +394,11 @@ struct time_stats {
391 394
392void bch_time_stats_update(struct time_stats *stats, uint64_t time); 395void bch_time_stats_update(struct time_stats *stats, uint64_t time);
393 396
397static inline unsigned local_clock_us(void)
398{
399 return local_clock() >> 10;
400}
401
394#define NSEC_PER_ns 1L 402#define NSEC_PER_ns 1L
395#define NSEC_PER_us NSEC_PER_USEC 403#define NSEC_PER_us NSEC_PER_USEC
396#define NSEC_PER_ms NSEC_PER_MSEC 404#define NSEC_PER_ms NSEC_PER_MSEC
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 6c44fe059c27..f4300e4c0114 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -111,7 +111,7 @@ static void dirty_init(struct keybuf_key *w)
111 if (!io->dc->writeback_percent) 111 if (!io->dc->writeback_percent)
112 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 112 bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
113 113
114 bio->bi_size = KEY_SIZE(&w->key) << 9; 114 bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
115 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); 115 bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
116 bio->bi_private = w; 116 bio->bi_private = w;
117 bio->bi_io_vec = bio->bi_inline_vecs; 117 bio->bi_io_vec = bio->bi_inline_vecs;
@@ -184,7 +184,7 @@ static void write_dirty(struct closure *cl)
184 184
185 dirty_init(w); 185 dirty_init(w);
186 io->bio.bi_rw = WRITE; 186 io->bio.bi_rw = WRITE;
187 io->bio.bi_sector = KEY_START(&w->key); 187 io->bio.bi_iter.bi_sector = KEY_START(&w->key);
188 io->bio.bi_bdev = io->dc->bdev; 188 io->bio.bi_bdev = io->dc->bdev;
189 io->bio.bi_end_io = dirty_endio; 189 io->bio.bi_end_io = dirty_endio;
190 190
@@ -253,7 +253,7 @@ static void read_dirty(struct cached_dev *dc)
253 io->dc = dc; 253 io->dc = dc;
254 254
255 dirty_init(w); 255 dirty_init(w);
256 io->bio.bi_sector = PTR_OFFSET(&w->key, 0); 256 io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
257 io->bio.bi_bdev = PTR_CACHE(dc->disk.c, 257 io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
258 &w->key, 0)->bdev; 258 &w->key, 0)->bdev;
259 io->bio.bi_rw = READ; 259 io->bio.bi_rw = READ;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index c9ddcf4614b9..e2f8598937ac 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -50,7 +50,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
50 return false; 50 return false;
51 51
52 if (dc->partial_stripes_expensive && 52 if (dc->partial_stripes_expensive &&
53 bcache_dev_stripe_dirty(dc, bio->bi_sector, 53 bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector,
54 bio_sectors(bio))) 54 bio_sectors(bio)))
55 return true; 55 return true;
56 56
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 12dc29ba7399..4195a01b1535 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1635,7 +1635,7 @@ int bitmap_create(struct mddev *mddev)
1635 sector_t blocks = mddev->resync_max_sectors; 1635 sector_t blocks = mddev->resync_max_sectors;
1636 struct file *file = mddev->bitmap_info.file; 1636 struct file *file = mddev->bitmap_info.file;
1637 int err; 1637 int err;
1638 struct sysfs_dirent *bm = NULL; 1638 struct kernfs_node *bm = NULL;
1639 1639
1640 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); 1640 BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
1641 1641
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index df4aeb6ac6f0..30210b9c4ef9 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -225,7 +225,7 @@ struct bitmap {
225 wait_queue_head_t overflow_wait; 225 wait_queue_head_t overflow_wait;
226 wait_queue_head_t behind_wait; 226 wait_queue_head_t behind_wait;
227 227
228 struct sysfs_dirent *sysfs_can_clear; 228 struct kernfs_node *sysfs_can_clear;
229}; 229};
230 230
231/* the bitmap API */ 231/* the bitmap API */
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index 3a8cfa2645c7..dd3646111561 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -17,55 +17,24 @@
17 * original bio state. 17 * original bio state.
18 */ 18 */
19 19
20struct dm_bio_vec_details {
21#if PAGE_SIZE < 65536
22 __u16 bv_len;
23 __u16 bv_offset;
24#else
25 unsigned bv_len;
26 unsigned bv_offset;
27#endif
28};
29
30struct dm_bio_details { 20struct dm_bio_details {
31 sector_t bi_sector;
32 struct block_device *bi_bdev; 21 struct block_device *bi_bdev;
33 unsigned int bi_size;
34 unsigned short bi_idx;
35 unsigned long bi_flags; 22 unsigned long bi_flags;
36 struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES]; 23 struct bvec_iter bi_iter;
37}; 24};
38 25
39static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) 26static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
40{ 27{
41 unsigned i;
42
43 bd->bi_sector = bio->bi_sector;
44 bd->bi_bdev = bio->bi_bdev; 28 bd->bi_bdev = bio->bi_bdev;
45 bd->bi_size = bio->bi_size;
46 bd->bi_idx = bio->bi_idx;
47 bd->bi_flags = bio->bi_flags; 29 bd->bi_flags = bio->bi_flags;
48 30 bd->bi_iter = bio->bi_iter;
49 for (i = 0; i < bio->bi_vcnt; i++) {
50 bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len;
51 bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset;
52 }
53} 31}
54 32
55static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) 33static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
56{ 34{
57 unsigned i;
58
59 bio->bi_sector = bd->bi_sector;
60 bio->bi_bdev = bd->bi_bdev; 35 bio->bi_bdev = bd->bi_bdev;
61 bio->bi_size = bd->bi_size;
62 bio->bi_idx = bd->bi_idx;
63 bio->bi_flags = bd->bi_flags; 36 bio->bi_flags = bd->bi_flags;
64 37 bio->bi_iter = bd->bi_iter;
65 for (i = 0; i < bio->bi_vcnt; i++) {
66 bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len;
67 bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset;
68 }
69} 38}
70 39
71#endif 40#endif
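
With the bio internals folded into bi_iter, saving and restoring a bio becomes three trivial member copies rather than a walk over the biovec. A hedged usage sketch of the pair (remap_and_resubmit() is a hypothetical helper standing in for whatever the target does between the two calls):

	struct dm_bio_details bd;

	dm_bio_record(&bd, bio);	/* snapshot bi_bdev, bi_flags, bi_iter */

	if (remap_and_resubmit(bio) < 0)
		dm_bio_restore(&bd, bio);	/* put the bio back so it can be
						 * retried on another path/leg */
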
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 54bdd923316f..66c5d130c8c2 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -104,6 +104,8 @@ struct dm_bufio_client {
104 struct list_head reserved_buffers; 104 struct list_head reserved_buffers;
105 unsigned need_reserved_buffers; 105 unsigned need_reserved_buffers;
106 106
107 unsigned minimum_buffers;
108
107 struct hlist_head *cache_hash; 109 struct hlist_head *cache_hash;
108 wait_queue_head_t free_buffer_wait; 110 wait_queue_head_t free_buffer_wait;
109 111
@@ -538,7 +540,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
538 bio_init(&b->bio); 540 bio_init(&b->bio);
539 b->bio.bi_io_vec = b->bio_vec; 541 b->bio.bi_io_vec = b->bio_vec;
540 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; 542 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
541 b->bio.bi_sector = block << b->c->sectors_per_block_bits; 543 b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
542 b->bio.bi_bdev = b->c->bdev; 544 b->bio.bi_bdev = b->c->bdev;
543 b->bio.bi_end_io = end_io; 545 b->bio.bi_end_io = end_io;
544 546
@@ -861,8 +863,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
861 buffers = dm_bufio_cache_size_per_client >> 863 buffers = dm_bufio_cache_size_per_client >>
862 (c->sectors_per_block_bits + SECTOR_SHIFT); 864 (c->sectors_per_block_bits + SECTOR_SHIFT);
863 865
864 if (buffers < DM_BUFIO_MIN_BUFFERS) 866 if (buffers < c->minimum_buffers)
865 buffers = DM_BUFIO_MIN_BUFFERS; 867 buffers = c->minimum_buffers;
866 868
867 *limit_buffers = buffers; 869 *limit_buffers = buffers;
868 *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; 870 *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
@@ -1350,6 +1352,34 @@ retry:
1350} 1352}
1351EXPORT_SYMBOL_GPL(dm_bufio_release_move); 1353EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1352 1354
1355/*
1356 * Free the given buffer.
1357 *
1358 * This is just a hint, if the buffer is in use or dirty, this function
1359 * does nothing.
1360 */
1361void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1362{
1363 struct dm_buffer *b;
1364
1365 dm_bufio_lock(c);
1366
1367 b = __find(c, block);
1368 if (b && likely(!b->hold_count) && likely(!b->state)) {
1369 __unlink_buffer(b);
1370 __free_buffer_wake(b);
1371 }
1372
1373 dm_bufio_unlock(c);
1374}
1375EXPORT_SYMBOL(dm_bufio_forget);
1376
1377void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1378{
1379 c->minimum_buffers = n;
1380}
1381EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
1382
1353unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) 1383unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1354{ 1384{
1355 return c->block_size; 1385 return c->block_size;
@@ -1546,6 +1576,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
1546 INIT_LIST_HEAD(&c->reserved_buffers); 1576 INIT_LIST_HEAD(&c->reserved_buffers);
1547 c->need_reserved_buffers = reserved_buffers; 1577 c->need_reserved_buffers = reserved_buffers;
1548 1578
1579 c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
1580
1549 init_waitqueue_head(&c->free_buffer_wait); 1581 init_waitqueue_head(&c->free_buffer_wait);
1550 c->async_write_error = 0; 1582 c->async_write_error = 0;
1551 1583
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index b142946a9e32..c096779a7292 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -108,6 +108,18 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c);
108 */ 108 */
109void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); 109void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
110 110
111/*
112 * Free the given buffer.
113 * This is just a hint, if the buffer is in use or dirty, this function
114 * does nothing.
115 */
116void dm_bufio_forget(struct dm_bufio_client *c, sector_t block);
117
118/*
119 * Set the minimum number of buffers before cleanup happens.
120 */
121void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n);
122
111unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); 123unsigned dm_bufio_get_block_size(struct dm_bufio_client *c);
112sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); 124sector_t dm_bufio_get_device_size(struct dm_bufio_client *c);
113sector_t dm_bufio_get_block_number(struct dm_buffer *b); 125sector_t dm_bufio_get_block_number(struct dm_buffer *b);
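
The two new entry points above give a bufio client some control over caching behaviour: dm_bufio_forget() drops a block early when it happens to be clean and unheld, and dm_bufio_set_minimum_buffers() raises the floor below which this client's cache will not be shrunk. A hedged usage sketch (c is a dm_bufio_client created elsewhere; the numbers are illustrative):

	/* keep at least 64 buffers around for this client */
	dm_bufio_set_minimum_buffers(c, 64);

	/* this block will not be re-read soon; free it now if possible
	 * (a no-op when the buffer is dirty or still held) */
	dm_bufio_forget(c, block);
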
diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c
new file mode 100644
index 000000000000..6c9049c51b2b
--- /dev/null
+++ b/drivers/md/dm-builtin.c
@@ -0,0 +1,48 @@
1#include "dm.h"
2
3/*
4 * The kobject release method must not be placed in the module itself,
5 * otherwise we are subject to module unload races.
6 *
7 * The release method is called when the last reference to the kobject is
8 * dropped. It may be called by any other kernel code that drops the last
9 * reference.
10 *
11 * The release method suffers from module unload race. We may prevent the
12 * module from being unloaded at the start of the release method (using
13 * increased module reference count or synchronizing against the release
14 * method), however there is no way to prevent the module from being
15 * unloaded at the end of the release method.
16 *
17 * If this code were placed in the dm module, the following race may
18 * happen:
19 * 1. Some other process takes a reference to dm kobject
20 * 2. The user issues ioctl function to unload the dm device
21 * 3. dm_sysfs_exit calls kobject_put, however the object is not released
22 * because of the other reference taken at step 1
23 * 4. dm_sysfs_exit waits on the completion
24 * 5. The other process that took the reference in step 1 drops it,
25 * dm_kobject_release is called from this process
26 * 6. dm_kobject_release calls complete()
27 * 7. a reschedule happens before dm_kobject_release returns
28 * 8. dm_sysfs_exit continues, the dm device is unloaded, module reference
29 * count is decremented
30 * 9. The user unloads the dm module
31 * 10. The other process that was rescheduled in step 7 continues to run,
32 * it is now executing code in unloaded module, so it crashes
33 *
34 * Note that if the process that takes the foreign reference to dm kobject
35 * has a low priority and the system is sufficiently loaded with
36 * higher-priority processes that prevent the low-priority process from
37 * being scheduled long enough, this bug may really happen.
38 *
39 * In order to fix this module unload race, we place the release method
40 * into a helper code that is compiled directly into the kernel.
41 */
42
43void dm_kobject_release(struct kobject *kobj)
44{
45 complete(dm_get_completion_from_kobject(kobj));
46}
47
48EXPORT_SYMBOL(dm_kobject_release);
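
The built-in release method only closes the race if the module side pairs it with a wait: the completion lives next to the kobject in memory the module still owns, the module drops its reference, and then blocks until dm_kobject_release() has signalled. Roughly (a hedged sketch of that pairing, not the literal dm.h/dm-sysfs.c code; the holder layout is an assumption):

struct dm_kobject_holder {
	struct kobject kobj;
	struct completion completion;	/* completed by dm_kobject_release() */
};

static void sketch_sysfs_exit(struct dm_kobject_holder *holder)
{
	kobject_put(&holder->kobj);
	/* holder is embedded in the mapped device, which outlives this call,
	 * so waiting on the completion after the put is safe */
	wait_for_completion(&holder->completion);
}
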
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 64780ad73bb0..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -72,7 +72,7 @@ static enum io_pattern iot_pattern(struct io_tracker *t)
72 72
73static void iot_update_stats(struct io_tracker *t, struct bio *bio) 73static void iot_update_stats(struct io_tracker *t, struct bio *bio)
74{ 74{
75 if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1) 75 if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
76 t->nr_seq_samples++; 76 t->nr_seq_samples++;
77 else { 77 else {
78 /* 78 /*
@@ -87,7 +87,7 @@ static void iot_update_stats(struct io_tracker *t, struct bio *bio)
87 t->nr_rand_samples++; 87 t->nr_rand_samples++;
88 } 88 }
89 89
90 t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1); 90 t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
91} 91}
92 92
93static void iot_check_for_pattern_switch(struct io_tracker *t) 93static void iot_check_for_pattern_switch(struct io_tracker *t)
@@ -287,9 +287,8 @@ static struct entry *alloc_entry(struct entry_pool *ep)
287static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) 287static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
288{ 288{
289 struct entry *e = ep->entries + from_cblock(cblock); 289 struct entry *e = ep->entries + from_cblock(cblock);
290 list_del(&e->list);
291 290
292 INIT_LIST_HEAD(&e->list); 291 list_del_init(&e->list);
293 INIT_HLIST_NODE(&e->hlist); 292 INIT_HLIST_NODE(&e->hlist);
294 ep->nr_allocated++; 293 ep->nr_allocated++;
295 294
@@ -391,6 +390,10 @@ struct mq_policy {
391 */ 390 */
392 unsigned promote_threshold; 391 unsigned promote_threshold;
393 392
393 unsigned discard_promote_adjustment;
394 unsigned read_promote_adjustment;
395 unsigned write_promote_adjustment;
396
394 /* 397 /*
395 * The hash table allows us to quickly find an entry by origin 398 * The hash table allows us to quickly find an entry by origin
396 * block. Both pre_cache and cache entries are in here. 399 * block. Both pre_cache and cache entries are in here.
@@ -400,6 +403,10 @@ struct mq_policy {
400 struct hlist_head *table; 403 struct hlist_head *table;
401}; 404};
402 405
406#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
407#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
408#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
409
403/*----------------------------------------------------------------*/ 410/*----------------------------------------------------------------*/
404 411
405/* 412/*
@@ -642,25 +649,21 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
642 * We bias towards reads, since they can be demoted at no cost if they 649 * We bias towards reads, since they can be demoted at no cost if they
643 * haven't been dirtied. 650 * haven't been dirtied.
644 */ 651 */
645#define DISCARDED_PROMOTE_THRESHOLD 1
646#define READ_PROMOTE_THRESHOLD 4
647#define WRITE_PROMOTE_THRESHOLD 8
648
649static unsigned adjusted_promote_threshold(struct mq_policy *mq, 652static unsigned adjusted_promote_threshold(struct mq_policy *mq,
650 bool discarded_oblock, int data_dir) 653 bool discarded_oblock, int data_dir)
651{ 654{
652 if (data_dir == READ) 655 if (data_dir == READ)
653 return mq->promote_threshold + READ_PROMOTE_THRESHOLD; 656 return mq->promote_threshold + mq->read_promote_adjustment;
654 657
655 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { 658 if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
656 /* 659 /*
657 * We don't need to do any copying at all, so give this a 660 * We don't need to do any copying at all, so give this a
658 * very low threshold. 661 * very low threshold.
659 */ 662 */
660 return DISCARDED_PROMOTE_THRESHOLD; 663 return mq->discard_promote_adjustment;
661 } 664 }
662 665
663 return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD; 666 return mq->promote_threshold + mq->write_promote_adjustment;
664} 667}
665 668
666static bool should_promote(struct mq_policy *mq, struct entry *e, 669static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -809,7 +812,7 @@ static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
809 bool can_migrate, bool discarded_oblock, 812 bool can_migrate, bool discarded_oblock,
810 int data_dir, struct policy_result *result) 813 int data_dir, struct policy_result *result)
811{ 814{
812 if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) { 815 if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
813 if (can_migrate) 816 if (can_migrate)
814 insert_in_cache(mq, oblock, result); 817 insert_in_cache(mq, oblock, result);
815 else 818 else
@@ -869,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p)
869{ 872{
870 struct mq_policy *mq = to_mq_policy(p); 873 struct mq_policy *mq = to_mq_policy(p);
871 874
872 kfree(mq->table); 875 vfree(mq->table);
873 epool_exit(&mq->cache_pool); 876 epool_exit(&mq->cache_pool);
874 epool_exit(&mq->pre_cache_pool); 877 epool_exit(&mq->pre_cache_pool);
875 kfree(mq); 878 kfree(mq);
@@ -1135,20 +1138,28 @@ static int mq_set_config_value(struct dm_cache_policy *p,
1135 const char *key, const char *value) 1138 const char *key, const char *value)
1136{ 1139{
1137 struct mq_policy *mq = to_mq_policy(p); 1140 struct mq_policy *mq = to_mq_policy(p);
1138 enum io_pattern pattern;
1139 unsigned long tmp; 1141 unsigned long tmp;
1140 1142
1141 if (!strcasecmp(key, "random_threshold"))
1142 pattern = PATTERN_RANDOM;
1143 else if (!strcasecmp(key, "sequential_threshold"))
1144 pattern = PATTERN_SEQUENTIAL;
1145 else
1146 return -EINVAL;
1147
1148 if (kstrtoul(value, 10, &tmp)) 1143 if (kstrtoul(value, 10, &tmp))
1149 return -EINVAL; 1144 return -EINVAL;
1150 1145
1151 mq->tracker.thresholds[pattern] = tmp; 1146 if (!strcasecmp(key, "random_threshold")) {
1147 mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
1148
1149 } else if (!strcasecmp(key, "sequential_threshold")) {
1150 mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
1151
1152 } else if (!strcasecmp(key, "discard_promote_adjustment"))
1153 mq->discard_promote_adjustment = tmp;
1154
1155 else if (!strcasecmp(key, "read_promote_adjustment"))
1156 mq->read_promote_adjustment = tmp;
1157
1158 else if (!strcasecmp(key, "write_promote_adjustment"))
1159 mq->write_promote_adjustment = tmp;
1160
1161 else
1162 return -EINVAL;
1152 1163
1153 return 0; 1164 return 0;
1154} 1165}
@@ -1158,9 +1169,16 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsign
1158 ssize_t sz = 0; 1169 ssize_t sz = 0;
1159 struct mq_policy *mq = to_mq_policy(p); 1170 struct mq_policy *mq = to_mq_policy(p);
1160 1171
1161 DMEMIT("4 random_threshold %u sequential_threshold %u", 1172 DMEMIT("10 random_threshold %u "
1173 "sequential_threshold %u "
1174 "discard_promote_adjustment %u "
1175 "read_promote_adjustment %u "
1176 "write_promote_adjustment %u",
1162 mq->tracker.thresholds[PATTERN_RANDOM], 1177 mq->tracker.thresholds[PATTERN_RANDOM],
1163 mq->tracker.thresholds[PATTERN_SEQUENTIAL]); 1178 mq->tracker.thresholds[PATTERN_SEQUENTIAL],
1179 mq->discard_promote_adjustment,
1180 mq->read_promote_adjustment,
1181 mq->write_promote_adjustment);
1164 1182
1165 return 0; 1183 return 0;
1166} 1184}
@@ -1213,6 +1231,9 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1213 mq->hit_count = 0; 1231 mq->hit_count = 0;
1214 mq->generation = 0; 1232 mq->generation = 0;
1215 mq->promote_threshold = 0; 1233 mq->promote_threshold = 0;
1234 mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
1235 mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
1236 mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
1216 mutex_init(&mq->lock); 1237 mutex_init(&mq->lock);
1217 spin_lock_init(&mq->tick_lock); 1238 spin_lock_init(&mq->tick_lock);
1218 1239
@@ -1224,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1224 1245
1225 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); 1246 mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
1226 mq->hash_bits = ffs(mq->nr_buckets) - 1; 1247 mq->hash_bits = ffs(mq->nr_buckets) - 1;
1227 mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); 1248 mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
1228 if (!mq->table) 1249 if (!mq->table)
1229 goto bad_alloc_table; 1250 goto bad_alloc_table;
1230 1251
@@ -1244,7 +1265,7 @@ bad_pre_cache_init:
1244 1265
1245static struct dm_cache_policy_type mq_policy_type = { 1266static struct dm_cache_policy_type mq_policy_type = {
1246 .name = "mq", 1267 .name = "mq",
1247 .version = {1, 1, 0}, 1268 .version = {1, 2, 0},
1248 .hint_size = 4, 1269 .hint_size = 4,
1249 .owner = THIS_MODULE, 1270 .owner = THIS_MODULE,
1250 .create = mq_create 1271 .create = mq_create
@@ -1252,10 +1273,11 @@ static struct dm_cache_policy_type mq_policy_type = {
1252 1273
1253static struct dm_cache_policy_type default_policy_type = { 1274static struct dm_cache_policy_type default_policy_type = {
1254 .name = "default", 1275 .name = "default",
1255 .version = {1, 1, 0}, 1276 .version = {1, 2, 0},
1256 .hint_size = 4, 1277 .hint_size = 4,
1257 .owner = THIS_MODULE, 1278 .owner = THIS_MODULE,
1258 .create = mq_create 1279 .create = mq_create,
1280 .real = &mq_policy_type
1259}; 1281};
1260 1282
1261static int __init mq_init(void) 1283static int __init mq_init(void)
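The new discard/read/write promote adjustments are consumed by adjusted_promote_threshold(), whose relaxed "<= 1" comparison opens this section. That function's body is not part of this diff, so the following is only a sketch of how such adjustments are typically folded into the base promote_threshold; the exact upstream logic may differ:

static unsigned adjusted_promote_threshold(struct mq_policy *mq,
					   bool discarded_oblock, int data_dir)
{
	if (data_dir == READ)
		return mq->promote_threshold + mq->read_promote_adjustment;

	/* A discarded block needs no copy back, so promote it cheaply. */
	if (discarded_oblock)
		return mq->discard_promote_adjustment;

	return mq->promote_threshold + mq->write_promote_adjustment;
}

Under this scheme, lowering read_promote_adjustment lets reads reach the cache after fewer hits, while raising write_promote_adjustment biases the cache away from write traffic.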
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
index d80057968407..c1a3cee99b44 100644
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -146,6 +146,10 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
146{ 146{
147 struct dm_cache_policy_type *t = p->private; 147 struct dm_cache_policy_type *t = p->private;
148 148
149 /* if t->real is set then an alias was used (e.g. "default") */
150 if (t->real)
151 return t->real->name;
152
149 return t->name; 153 return t->name;
150} 154}
151EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); 155EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 052c00a84a5c..f50fe360c546 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -223,6 +223,12 @@ struct dm_cache_policy_type {
223 unsigned version[CACHE_POLICY_VERSION_SIZE]; 223 unsigned version[CACHE_POLICY_VERSION_SIZE];
224 224
225 /* 225 /*
226 * For use by an alias dm_cache_policy_type to point to the
227 * real dm_cache_policy_type.
228 */
229 struct dm_cache_policy_type *real;
230
231 /*
226 * Policies may store a hint for each cache block. 232 * Policies may store a hint for each cache block.
227 * Currently the size of this hint must be 0 or 4 bytes but we 233 * Currently the size of this hint must be 0 or 4 bytes but we
228 * expect to relax this in future. 234 * expect to relax this in future.
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1b1469ebe5cb..074b9c8e4cf0 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -85,6 +85,12 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
85{ 85{
86 bio->bi_end_io = h->bi_end_io; 86 bio->bi_end_io = h->bi_end_io;
87 bio->bi_private = h->bi_private; 87 bio->bi_private = h->bi_private;
88
89 /*
90 * Must bump bi_remaining to allow bio to complete with
91 * restored bi_end_io.
92 */
93 atomic_inc(&bio->bi_remaining);
88} 94}
89 95
90/*----------------------------------------------------------------*/ 96/*----------------------------------------------------------------*/
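dm_unhook_bio() undoes a temporary substitution of the bio's completion routine, and the bi_remaining bump is needed because bio_endio() has already consumed one completion reference when the hooked end_io ran. The hook side is not shown in this hunk; a sketch of what it is assumed to look like:

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	/* Save the current completion so dm_unhook_bio() can restore it. */
	h->bi_end_io = bio->bi_end_io;
	h->bi_private = bio->bi_private;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}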
@@ -283,6 +289,7 @@ struct per_bio_data {
283 bool tick:1; 289 bool tick:1;
284 unsigned req_nr:2; 290 unsigned req_nr:2;
285 struct dm_deferred_entry *all_io_entry; 291 struct dm_deferred_entry *all_io_entry;
292 struct dm_hook_info hook_info;
286 293
287 /* 294 /*
288 * writethrough fields. These MUST remain at the end of this 295 * writethrough fields. These MUST remain at the end of this
@@ -291,7 +298,6 @@ struct per_bio_data {
291 */ 298 */
292 struct cache *cache; 299 struct cache *cache;
293 dm_cblock_t cblock; 300 dm_cblock_t cblock;
294 struct dm_hook_info hook_info;
295 struct dm_bio_details bio_details; 301 struct dm_bio_details bio_details;
296}; 302};
297 303
@@ -664,15 +670,18 @@ static void remap_to_origin(struct cache *cache, struct bio *bio)
664static void remap_to_cache(struct cache *cache, struct bio *bio, 670static void remap_to_cache(struct cache *cache, struct bio *bio,
665 dm_cblock_t cblock) 671 dm_cblock_t cblock)
666{ 672{
667 sector_t bi_sector = bio->bi_sector; 673 sector_t bi_sector = bio->bi_iter.bi_sector;
674 sector_t block = from_cblock(cblock);
668 675
669 bio->bi_bdev = cache->cache_dev->bdev; 676 bio->bi_bdev = cache->cache_dev->bdev;
670 if (!block_size_is_power_of_two(cache)) 677 if (!block_size_is_power_of_two(cache))
671 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) + 678 bio->bi_iter.bi_sector =
672 sector_div(bi_sector, cache->sectors_per_block); 679 (block * cache->sectors_per_block) +
680 sector_div(bi_sector, cache->sectors_per_block);
673 else 681 else
674 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) | 682 bio->bi_iter.bi_sector =
675 (bi_sector & (cache->sectors_per_block - 1)); 683 (block << cache->sectors_per_block_shift) |
684 (bi_sector & (cache->sectors_per_block - 1));
676} 685}
677 686
678static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 687static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
@@ -712,7 +721,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
712 721
713static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 722static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
714{ 723{
715 sector_t block_nr = bio->bi_sector; 724 sector_t block_nr = bio->bi_iter.bi_sector;
716 725
717 if (!block_size_is_power_of_two(cache)) 726 if (!block_size_is_power_of_two(cache))
718 (void) sector_div(block_nr, cache->sectors_per_block); 727 (void) sector_div(block_nr, cache->sectors_per_block);
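Both remap_to_cache() and get_bio_block() now read the sector from bio->bi_iter; the power-of-two fast path is plain shift-and-mask arithmetic. A worked example with made-up values, runnable as ordinary userspace C:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t sectors_per_block = 128;	/* 64KiB blocks */
	uint64_t shift = 7;			/* ffs(128) - 1 */
	uint64_t bi_sector = 1037;		/* incoming bio sector */
	uint64_t cblock = 5;			/* cache block chosen by the policy */

	uint64_t oblock = bi_sector >> shift;	/* origin block, as in get_bio_block() */
	uint64_t cache_sector = (cblock << shift) |
				(bi_sector & (sectors_per_block - 1));

	assert(oblock == 8);		/* sectors 1024..1151 fall in block 8 */
	assert(cache_sector == 653);	/* 5 * 128 + (1037 - 1024) */
	return 0;
}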
@@ -970,12 +979,13 @@ static void issue_copy_real(struct dm_cache_migration *mg)
970 int r; 979 int r;
971 struct dm_io_region o_region, c_region; 980 struct dm_io_region o_region, c_region;
972 struct cache *cache = mg->cache; 981 struct cache *cache = mg->cache;
982 sector_t cblock = from_cblock(mg->cblock);
973 983
974 o_region.bdev = cache->origin_dev->bdev; 984 o_region.bdev = cache->origin_dev->bdev;
975 o_region.count = cache->sectors_per_block; 985 o_region.count = cache->sectors_per_block;
976 986
977 c_region.bdev = cache->cache_dev->bdev; 987 c_region.bdev = cache->cache_dev->bdev;
978 c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block; 988 c_region.sector = cblock * cache->sectors_per_block;
979 c_region.count = cache->sectors_per_block; 989 c_region.count = cache->sectors_per_block;
980 990
981 if (mg->writeback || mg->demote) { 991 if (mg->writeback || mg->demote) {
@@ -1002,13 +1012,15 @@ static void overwrite_endio(struct bio *bio, int err)
1002 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1012 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1003 unsigned long flags; 1013 unsigned long flags;
1004 1014
1015 dm_unhook_bio(&pb->hook_info, bio);
1016
1005 if (err) 1017 if (err)
1006 mg->err = true; 1018 mg->err = true;
1007 1019
1020 mg->requeue_holder = false;
1021
1008 spin_lock_irqsave(&cache->lock, flags); 1022 spin_lock_irqsave(&cache->lock, flags);
1009 list_add_tail(&mg->list, &cache->completed_migrations); 1023 list_add_tail(&mg->list, &cache->completed_migrations);
1010 dm_unhook_bio(&pb->hook_info, bio);
1011 mg->requeue_holder = false;
1012 spin_unlock_irqrestore(&cache->lock, flags); 1024 spin_unlock_irqrestore(&cache->lock, flags);
1013 1025
1014 wake_worker(cache); 1026 wake_worker(cache);
@@ -1027,7 +1039,7 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
1027static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1039static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1028{ 1040{
1029 return (bio_data_dir(bio) == WRITE) && 1041 return (bio_data_dir(bio) == WRITE) &&
1030 (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1042 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1031} 1043}
1032 1044
1033static void avoid_copy(struct dm_cache_migration *mg) 1045static void avoid_copy(struct dm_cache_migration *mg)
@@ -1252,7 +1264,7 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
1252 size_t pb_data_size = get_per_bio_data_size(cache); 1264 size_t pb_data_size = get_per_bio_data_size(cache);
1253 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1265 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1254 1266
1255 BUG_ON(bio->bi_size); 1267 BUG_ON(bio->bi_iter.bi_size);
1256 if (!pb->req_nr) 1268 if (!pb->req_nr)
1257 remap_to_origin(cache, bio); 1269 remap_to_origin(cache, bio);
1258 else 1270 else
@@ -1275,9 +1287,9 @@ static void process_flush_bio(struct cache *cache, struct bio *bio)
1275 */ 1287 */
1276static void process_discard_bio(struct cache *cache, struct bio *bio) 1288static void process_discard_bio(struct cache *cache, struct bio *bio)
1277{ 1289{
1278 dm_block_t start_block = dm_sector_div_up(bio->bi_sector, 1290 dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector,
1279 cache->discard_block_size); 1291 cache->discard_block_size);
1280 dm_block_t end_block = bio->bi_sector + bio_sectors(bio); 1292 dm_block_t end_block = bio_end_sector(bio);
1281 dm_block_t b; 1293 dm_block_t b;
1282 1294
1283 end_block = block_div(end_block, cache->discard_block_size); 1295 end_block = block_div(end_block, cache->discard_block_size);
@@ -2453,20 +2465,18 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
2453 bool discarded_block; 2465 bool discarded_block;
2454 struct dm_bio_prison_cell *cell; 2466 struct dm_bio_prison_cell *cell;
2455 struct policy_result lookup_result; 2467 struct policy_result lookup_result;
2456 struct per_bio_data *pb; 2468 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
2457 2469
2458 if (from_oblock(block) > from_oblock(cache->origin_blocks)) { 2470 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2459 /* 2471 /*
2460 * This can only occur if the io goes to a partial block at 2472 * This can only occur if the io goes to a partial block at
2461 * the end of the origin device. We don't cache these. 2473 * the end of the origin device. We don't cache these.
2462 * Just remap to the origin and carry on. 2474 * Just remap to the origin and carry on.
2463 */ 2475 */
2464 remap_to_origin_clear_discard(cache, bio, block); 2476 remap_to_origin(cache, bio);
2465 return DM_MAPIO_REMAPPED; 2477 return DM_MAPIO_REMAPPED;
2466 } 2478 }
2467 2479
2468 pb = init_per_bio_data(bio, pb_data_size);
2469
2470 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { 2480 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2471 defer_bio(cache, bio); 2481 defer_bio(cache, bio);
2472 return DM_MAPIO_SUBMITTED; 2482 return DM_MAPIO_SUBMITTED;
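The bounds test above also changes from ">" to ">=". A tiny illustration with made-up sizes shows why the old test let a bio aimed at the trailing partial block fall through to the caching path:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t origin_blocks = 100;	/* whole blocks; valid indices are 0..99 */
	uint64_t block = 100;		/* bio landing in the partial tail block */

	assert(!(block >  origin_blocks));	/* old test: not caught, bio was cached */
	assert(  block >= origin_blocks);	/* new test: caught, remapped to origin */
	return 0;
}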
@@ -2826,12 +2836,13 @@ static void cache_resume(struct dm_target *ti)
2826/* 2836/*
2827 * Status format: 2837 * Status format:
2828 * 2838 *
2829 * <#used metadata blocks>/<#total metadata blocks> 2839 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
2840 * <cache block size> <#used cache blocks>/<#total cache blocks>
2830 * <#read hits> <#read misses> <#write hits> <#write misses> 2841 * <#read hits> <#read misses> <#write hits> <#write misses>
2831 * <#demotions> <#promotions> <#blocks in cache> <#dirty> 2842 * <#demotions> <#promotions> <#dirty>
2832 * <#features> <features>* 2843 * <#features> <features>*
2833 * <#core args> <core args> 2844 * <#core args> <core args>
2834 * <#policy args> <policy args>* 2845 * <policy name> <#policy args> <policy args>*
2835 */ 2846 */
2836static void cache_status(struct dm_target *ti, status_type_t type, 2847static void cache_status(struct dm_target *ti, status_type_t type,
2837 unsigned status_flags, char *result, unsigned maxlen) 2848 unsigned status_flags, char *result, unsigned maxlen)
@@ -2869,17 +2880,20 @@ static void cache_status(struct dm_target *ti, status_type_t type,
2869 2880
2870 residency = policy_residency(cache->policy); 2881 residency = policy_residency(cache->policy);
2871 2882
2872 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ", 2883 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %llu ",
2884 (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
2873 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 2885 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2874 (unsigned long long)nr_blocks_metadata, 2886 (unsigned long long)nr_blocks_metadata,
2887 cache->sectors_per_block,
2888 (unsigned long long) from_cblock(residency),
2889 (unsigned long long) from_cblock(cache->cache_size),
2875 (unsigned) atomic_read(&cache->stats.read_hit), 2890 (unsigned) atomic_read(&cache->stats.read_hit),
2876 (unsigned) atomic_read(&cache->stats.read_miss), 2891 (unsigned) atomic_read(&cache->stats.read_miss),
2877 (unsigned) atomic_read(&cache->stats.write_hit), 2892 (unsigned) atomic_read(&cache->stats.write_hit),
2878 (unsigned) atomic_read(&cache->stats.write_miss), 2893 (unsigned) atomic_read(&cache->stats.write_miss),
2879 (unsigned) atomic_read(&cache->stats.demotion), 2894 (unsigned) atomic_read(&cache->stats.demotion),
2880 (unsigned) atomic_read(&cache->stats.promotion), 2895 (unsigned) atomic_read(&cache->stats.promotion),
2881 (unsigned long long) from_cblock(residency), 2896 (unsigned long long) from_cblock(cache->nr_dirty));
2882 cache->nr_dirty);
2883 2897
2884 if (writethrough_mode(&cache->features)) 2898 if (writethrough_mode(&cache->features))
2885 DMEMIT("1 writethrough "); 2899 DMEMIT("1 writethrough ");
@@ -2896,6 +2910,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
2896 } 2910 }
2897 2911
2898 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 2912 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2913
2914 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
2899 if (sz < maxlen) { 2915 if (sz < maxlen) {
2900 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); 2916 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2901 if (r) 2917 if (r)
@@ -3129,7 +3145,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3129 3145
3130static struct target_type cache_target = { 3146static struct target_type cache_target = {
3131 .name = "cache", 3147 .name = "cache",
3132 .version = {1, 2, 0}, 3148 .version = {1, 3, 0},
3133 .module = THIS_MODULE, 3149 .module = THIS_MODULE,
3134 .ctr = cache_ctr, 3150 .ctr = cache_ctr,
3135 .dtr = cache_dtr, 3151 .dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 81b0fa660452..784695d22fde 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -39,10 +39,8 @@ struct convert_context {
39 struct completion restart; 39 struct completion restart;
40 struct bio *bio_in; 40 struct bio *bio_in;
41 struct bio *bio_out; 41 struct bio *bio_out;
42 unsigned int offset_in; 42 struct bvec_iter iter_in;
43 unsigned int offset_out; 43 struct bvec_iter iter_out;
44 unsigned int idx_in;
45 unsigned int idx_out;
46 sector_t cc_sector; 44 sector_t cc_sector;
47 atomic_t cc_pending; 45 atomic_t cc_pending;
48}; 46};
@@ -826,10 +824,10 @@ static void crypt_convert_init(struct crypt_config *cc,
826{ 824{
827 ctx->bio_in = bio_in; 825 ctx->bio_in = bio_in;
828 ctx->bio_out = bio_out; 826 ctx->bio_out = bio_out;
829 ctx->offset_in = 0; 827 if (bio_in)
830 ctx->offset_out = 0; 828 ctx->iter_in = bio_in->bi_iter;
831 ctx->idx_in = bio_in ? bio_in->bi_idx : 0; 829 if (bio_out)
832 ctx->idx_out = bio_out ? bio_out->bi_idx : 0; 830 ctx->iter_out = bio_out->bi_iter;
833 ctx->cc_sector = sector + cc->iv_offset; 831 ctx->cc_sector = sector + cc->iv_offset;
834 init_completion(&ctx->restart); 832 init_completion(&ctx->restart);
835} 833}
@@ -857,8 +855,8 @@ static int crypt_convert_block(struct crypt_config *cc,
857 struct convert_context *ctx, 855 struct convert_context *ctx,
858 struct ablkcipher_request *req) 856 struct ablkcipher_request *req)
859{ 857{
860 struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); 858 struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in);
861 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); 859 struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
862 struct dm_crypt_request *dmreq; 860 struct dm_crypt_request *dmreq;
863 u8 *iv; 861 u8 *iv;
864 int r; 862 int r;
@@ -869,24 +867,15 @@ static int crypt_convert_block(struct crypt_config *cc,
869 dmreq->iv_sector = ctx->cc_sector; 867 dmreq->iv_sector = ctx->cc_sector;
870 dmreq->ctx = ctx; 868 dmreq->ctx = ctx;
871 sg_init_table(&dmreq->sg_in, 1); 869 sg_init_table(&dmreq->sg_in, 1);
872 sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, 870 sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT,
873 bv_in->bv_offset + ctx->offset_in); 871 bv_in.bv_offset);
874 872
875 sg_init_table(&dmreq->sg_out, 1); 873 sg_init_table(&dmreq->sg_out, 1);
876 sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, 874 sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT,
877 bv_out->bv_offset + ctx->offset_out); 875 bv_out.bv_offset);
878 876
879 ctx->offset_in += 1 << SECTOR_SHIFT; 877 bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT);
880 if (ctx->offset_in >= bv_in->bv_len) { 878 bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT);
881 ctx->offset_in = 0;
882 ctx->idx_in++;
883 }
884
885 ctx->offset_out += 1 << SECTOR_SHIFT;
886 if (ctx->offset_out >= bv_out->bv_len) {
887 ctx->offset_out = 0;
888 ctx->idx_out++;
889 }
890 879
891 if (cc->iv_gen_ops) { 880 if (cc->iv_gen_ops) {
892 r = cc->iv_gen_ops->generator(cc, iv, dmreq); 881 r = cc->iv_gen_ops->generator(cc, iv, dmreq);
@@ -937,8 +926,7 @@ static int crypt_convert(struct crypt_config *cc,
937 926
938 atomic_set(&ctx->cc_pending, 1); 927 atomic_set(&ctx->cc_pending, 1);
939 928
940 while(ctx->idx_in < ctx->bio_in->bi_vcnt && 929 while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) {
941 ctx->idx_out < ctx->bio_out->bi_vcnt) {
942 930
943 crypt_alloc_req(cc, ctx); 931 crypt_alloc_req(cc, ctx);
944 932
@@ -1021,7 +1009,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
1021 size -= len; 1009 size -= len;
1022 } 1010 }
1023 1011
1024 if (!clone->bi_size) { 1012 if (!clone->bi_iter.bi_size) {
1025 bio_put(clone); 1013 bio_put(clone);
1026 return NULL; 1014 return NULL;
1027 } 1015 }
@@ -1161,7 +1149,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
1161 crypt_inc_pending(io); 1149 crypt_inc_pending(io);
1162 1150
1163 clone_init(io, clone); 1151 clone_init(io, clone);
1164 clone->bi_sector = cc->start + io->sector; 1152 clone->bi_iter.bi_sector = cc->start + io->sector;
1165 1153
1166 generic_make_request(clone); 1154 generic_make_request(clone);
1167 return 0; 1155 return 0;
@@ -1207,9 +1195,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
1207 } 1195 }
1208 1196
1209 /* crypt_convert should have filled the clone bio */ 1197 /* crypt_convert should have filled the clone bio */
1210 BUG_ON(io->ctx.idx_out < clone->bi_vcnt); 1198 BUG_ON(io->ctx.iter_out.bi_size);
1211 1199
1212 clone->bi_sector = cc->start + io->sector; 1200 clone->bi_iter.bi_sector = cc->start + io->sector;
1213 1201
1214 if (async) 1202 if (async)
1215 kcryptd_queue_io(io); 1203 kcryptd_queue_io(io);
@@ -1224,7 +1212,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1224 struct dm_crypt_io *new_io; 1212 struct dm_crypt_io *new_io;
1225 int crypt_finished; 1213 int crypt_finished;
1226 unsigned out_of_pages = 0; 1214 unsigned out_of_pages = 0;
1227 unsigned remaining = io->base_bio->bi_size; 1215 unsigned remaining = io->base_bio->bi_iter.bi_size;
1228 sector_t sector = io->sector; 1216 sector_t sector = io->sector;
1229 int r; 1217 int r;
1230 1218
@@ -1246,9 +1234,9 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1246 } 1234 }
1247 1235
1248 io->ctx.bio_out = clone; 1236 io->ctx.bio_out = clone;
1249 io->ctx.idx_out = 0; 1237 io->ctx.iter_out = clone->bi_iter;
1250 1238
1251 remaining -= clone->bi_size; 1239 remaining -= clone->bi_iter.bi_size;
1252 sector += bio_sectors(clone); 1240 sector += bio_sectors(clone);
1253 1241
1254 crypt_inc_pending(io); 1242 crypt_inc_pending(io);
@@ -1290,8 +1278,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1290 crypt_inc_pending(new_io); 1278 crypt_inc_pending(new_io);
1291 crypt_convert_init(cc, &new_io->ctx, NULL, 1279 crypt_convert_init(cc, &new_io->ctx, NULL,
1292 io->base_bio, sector); 1280 io->base_bio, sector);
1293 new_io->ctx.idx_in = io->ctx.idx_in; 1281 new_io->ctx.iter_in = io->ctx.iter_in;
1294 new_io->ctx.offset_in = io->ctx.offset_in;
1295 1282
1296 /* 1283 /*
1297 * Fragments after the first use the base_io 1284 * Fragments after the first use the base_io
@@ -1869,11 +1856,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
1869 if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { 1856 if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
1870 bio->bi_bdev = cc->dev->bdev; 1857 bio->bi_bdev = cc->dev->bdev;
1871 if (bio_sectors(bio)) 1858 if (bio_sectors(bio))
1872 bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector); 1859 bio->bi_iter.bi_sector = cc->start +
1860 dm_target_offset(ti, bio->bi_iter.bi_sector);
1873 return DM_MAPIO_REMAPPED; 1861 return DM_MAPIO_REMAPPED;
1874 } 1862 }
1875 1863
1876 io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_sector)); 1864 io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
1877 1865
1878 if (bio_data_dir(io->base_bio) == READ) { 1866 if (bio_data_dir(io->base_bio) == READ) {
1879 if (kcryptd_io_read(io, GFP_NOWAIT)) 1867 if (kcryptd_io_read(io, GFP_NOWAIT))
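crypt_convert_block() now advances a struct bvec_iter through both bios one 512-byte sector at a time instead of maintaining idx/offset pairs by hand. The iteration pattern in isolation (kernel context assumed; the helper and callback names here are hypothetical, only bio_iter_iovec() and bio_advance_iter() are the block-layer primitives used above):

static void for_each_512b_segment(struct bio *bio,
				  void (*fn)(struct page *page, unsigned offset))
{
	struct bvec_iter iter = bio->bi_iter;	/* private copy; the bio itself is untouched */

	while (iter.bi_size) {
		struct bio_vec bv = bio_iter_iovec(bio, iter);

		fn(bv.bv_page, bv.bv_offset);	/* exactly one sector's worth */
		bio_advance_iter(bio, &iter, 1 << SECTOR_SHIFT);
	}
}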
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 2f91d6d4a2cc..42c3a27a14cc 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -24,7 +24,6 @@ struct delay_c {
24 struct work_struct flush_expired_bios; 24 struct work_struct flush_expired_bios;
25 struct list_head delayed_bios; 25 struct list_head delayed_bios;
26 atomic_t may_delay; 26 atomic_t may_delay;
27 mempool_t *delayed_pool;
28 27
29 struct dm_dev *dev_read; 28 struct dm_dev *dev_read;
30 sector_t start_read; 29 sector_t start_read;
@@ -40,14 +39,11 @@ struct delay_c {
40struct dm_delay_info { 39struct dm_delay_info {
41 struct delay_c *context; 40 struct delay_c *context;
42 struct list_head list; 41 struct list_head list;
43 struct bio *bio;
44 unsigned long expires; 42 unsigned long expires;
45}; 43};
46 44
47static DEFINE_MUTEX(delayed_bios_lock); 45static DEFINE_MUTEX(delayed_bios_lock);
48 46
49static struct kmem_cache *delayed_cache;
50
51static void handle_delayed_timer(unsigned long data) 47static void handle_delayed_timer(unsigned long data)
52{ 48{
53 struct delay_c *dc = (struct delay_c *)data; 49 struct delay_c *dc = (struct delay_c *)data;
@@ -87,13 +83,14 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
87 mutex_lock(&delayed_bios_lock); 83 mutex_lock(&delayed_bios_lock);
88 list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { 84 list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
89 if (flush_all || time_after_eq(jiffies, delayed->expires)) { 85 if (flush_all || time_after_eq(jiffies, delayed->expires)) {
86 struct bio *bio = dm_bio_from_per_bio_data(delayed,
87 sizeof(struct dm_delay_info));
90 list_del(&delayed->list); 88 list_del(&delayed->list);
91 bio_list_add(&flush_bios, delayed->bio); 89 bio_list_add(&flush_bios, bio);
92 if ((bio_data_dir(delayed->bio) == WRITE)) 90 if ((bio_data_dir(bio) == WRITE))
93 delayed->context->writes--; 91 delayed->context->writes--;
94 else 92 else
95 delayed->context->reads--; 93 delayed->context->reads--;
96 mempool_free(delayed, dc->delayed_pool);
97 continue; 94 continue;
98 } 95 }
99 96
@@ -185,12 +182,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
185 } 182 }
186 183
187out: 184out:
188 dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache);
189 if (!dc->delayed_pool) {
190 DMERR("Couldn't create delayed bio pool.");
191 goto bad_dev_write;
192 }
193
194 dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); 185 dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
195 if (!dc->kdelayd_wq) { 186 if (!dc->kdelayd_wq) {
196 DMERR("Couldn't start kdelayd"); 187 DMERR("Couldn't start kdelayd");
@@ -206,12 +197,11 @@ out:
206 197
207 ti->num_flush_bios = 1; 198 ti->num_flush_bios = 1;
208 ti->num_discard_bios = 1; 199 ti->num_discard_bios = 1;
200 ti->per_bio_data_size = sizeof(struct dm_delay_info);
209 ti->private = dc; 201 ti->private = dc;
210 return 0; 202 return 0;
211 203
212bad_queue: 204bad_queue:
213 mempool_destroy(dc->delayed_pool);
214bad_dev_write:
215 if (dc->dev_write) 205 if (dc->dev_write)
216 dm_put_device(ti, dc->dev_write); 206 dm_put_device(ti, dc->dev_write);
217bad_dev_read: 207bad_dev_read:
@@ -232,7 +222,6 @@ static void delay_dtr(struct dm_target *ti)
232 if (dc->dev_write) 222 if (dc->dev_write)
233 dm_put_device(ti, dc->dev_write); 223 dm_put_device(ti, dc->dev_write);
234 224
235 mempool_destroy(dc->delayed_pool);
236 kfree(dc); 225 kfree(dc);
237} 226}
238 227
@@ -244,10 +233,9 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
244 if (!delay || !atomic_read(&dc->may_delay)) 233 if (!delay || !atomic_read(&dc->may_delay))
245 return 1; 234 return 1;
246 235
247 delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO); 236 delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
248 237
249 delayed->context = dc; 238 delayed->context = dc;
250 delayed->bio = bio;
251 delayed->expires = expires = jiffies + (delay * HZ / 1000); 239 delayed->expires = expires = jiffies + (delay * HZ / 1000);
252 240
253 mutex_lock(&delayed_bios_lock); 241 mutex_lock(&delayed_bios_lock);
@@ -289,14 +277,15 @@ static int delay_map(struct dm_target *ti, struct bio *bio)
289 if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { 277 if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
290 bio->bi_bdev = dc->dev_write->bdev; 278 bio->bi_bdev = dc->dev_write->bdev;
291 if (bio_sectors(bio)) 279 if (bio_sectors(bio))
292 bio->bi_sector = dc->start_write + 280 bio->bi_iter.bi_sector = dc->start_write +
293 dm_target_offset(ti, bio->bi_sector); 281 dm_target_offset(ti, bio->bi_iter.bi_sector);
294 282
295 return delay_bio(dc, dc->write_delay, bio); 283 return delay_bio(dc, dc->write_delay, bio);
296 } 284 }
297 285
298 bio->bi_bdev = dc->dev_read->bdev; 286 bio->bi_bdev = dc->dev_read->bdev;
299 bio->bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_sector); 287 bio->bi_iter.bi_sector = dc->start_read +
288 dm_target_offset(ti, bio->bi_iter.bi_sector);
300 289
301 return delay_bio(dc, dc->read_delay, bio); 290 return delay_bio(dc, dc->read_delay, bio);
302} 291}
@@ -356,13 +345,7 @@ static struct target_type delay_target = {
356 345
357static int __init dm_delay_init(void) 346static int __init dm_delay_init(void)
358{ 347{
359 int r = -ENOMEM; 348 int r;
360
361 delayed_cache = KMEM_CACHE(dm_delay_info, 0);
362 if (!delayed_cache) {
363 DMERR("Couldn't create delayed bio cache.");
364 goto bad_memcache;
365 }
366 349
367 r = dm_register_target(&delay_target); 350 r = dm_register_target(&delay_target);
368 if (r < 0) { 351 if (r < 0) {
@@ -373,15 +356,12 @@ static int __init dm_delay_init(void)
373 return 0; 356 return 0;
374 357
375bad_register: 358bad_register:
376 kmem_cache_destroy(delayed_cache);
377bad_memcache:
378 return r; 359 return r;
379} 360}
380 361
381static void __exit dm_delay_exit(void) 362static void __exit dm_delay_exit(void)
382{ 363{
383 dm_unregister_target(&delay_target); 364 dm_unregister_target(&delay_target);
384 kmem_cache_destroy(delayed_cache);
385} 365}
386 366
387/* Module hooks */ 367/* Module hooks */
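dm-delay drops its private slab and mempool in favour of per-bio data reserved by the device-mapper core. The essentials of that pattern (kernel context assumed; the example_* names are made up, while per_bio_data_size and the dm_per_bio_data()/dm_bio_from_per_bio_data() helpers are the real device-mapper interfaces used above):

struct example_info {
	unsigned long expires;
};

static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	/* Ask the core to reserve room alongside every bio it hands us. */
	ti->per_bio_data_size = sizeof(struct example_info);
	return 0;
}

static int example_map(struct dm_target *ti, struct bio *bio)
{
	struct example_info *info = dm_per_bio_data(bio, sizeof(struct example_info));

	info->expires = jiffies + HZ;

	/* Later the bio can be recovered from the stored record, as
	 * flush_delayed_bios() does above:
	 * struct bio *b = dm_bio_from_per_bio_data(info, sizeof(struct example_info));
	 */
	return DM_MAPIO_REMAPPED;
}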
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index c80a0ec5f126..b257e46876d3 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -248,7 +248,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
248 248
249 bio->bi_bdev = fc->dev->bdev; 249 bio->bi_bdev = fc->dev->bdev;
250 if (bio_sectors(bio)) 250 if (bio_sectors(bio))
251 bio->bi_sector = flakey_map_sector(ti, bio->bi_sector); 251 bio->bi_iter.bi_sector =
252 flakey_map_sector(ti, bio->bi_iter.bi_sector);
252} 253}
253 254
254static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) 255static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
@@ -265,8 +266,8 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
265 DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " 266 DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
266 "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", 267 "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
267 bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, 268 bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
268 (bio_data_dir(bio) == WRITE) ? 'w' : 'r', 269 (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_rw,
269 bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes); 270 (unsigned long long)bio->bi_iter.bi_sector, bio_bytes);
270 } 271 }
271} 272}
272 273
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2a20986a2fec..3842ac738f98 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -201,26 +201,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse
201/* 201/*
202 * Functions for getting the pages from a bvec. 202 * Functions for getting the pages from a bvec.
203 */ 203 */
204static void bvec_get_page(struct dpages *dp, 204static void bio_get_page(struct dpages *dp, struct page **p,
205 struct page **p, unsigned long *len, unsigned *offset) 205 unsigned long *len, unsigned *offset)
206{ 206{
207 struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; 207 struct bio_vec *bvec = dp->context_ptr;
208 *p = bvec->bv_page; 208 *p = bvec->bv_page;
209 *len = bvec->bv_len; 209 *len = bvec->bv_len - dp->context_u;
210 *offset = bvec->bv_offset; 210 *offset = bvec->bv_offset + dp->context_u;
211} 211}
212 212
213static void bvec_next_page(struct dpages *dp) 213static void bio_next_page(struct dpages *dp)
214{ 214{
215 struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; 215 struct bio_vec *bvec = dp->context_ptr;
216 dp->context_ptr = bvec + 1; 216 dp->context_ptr = bvec + 1;
217 dp->context_u = 0;
217} 218}
218 219
219static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) 220static void bio_dp_init(struct dpages *dp, struct bio *bio)
220{ 221{
221 dp->get_page = bvec_get_page; 222 dp->get_page = bio_get_page;
222 dp->next_page = bvec_next_page; 223 dp->next_page = bio_next_page;
223 dp->context_ptr = bvec; 224 dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
225 dp->context_u = bio->bi_iter.bi_bvec_done;
224} 226}
225 227
226/* 228/*
@@ -304,14 +306,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
304 dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); 306 dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
305 307
306 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); 308 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
307 bio->bi_sector = where->sector + (where->count - remaining); 309 bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
308 bio->bi_bdev = where->bdev; 310 bio->bi_bdev = where->bdev;
309 bio->bi_end_io = endio; 311 bio->bi_end_io = endio;
310 store_io_and_region_in_bio(bio, io, region); 312 store_io_and_region_in_bio(bio, io, region);
311 313
312 if (rw & REQ_DISCARD) { 314 if (rw & REQ_DISCARD) {
313 num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); 315 num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
314 bio->bi_size = num_sectors << SECTOR_SHIFT; 316 bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
315 remaining -= num_sectors; 317 remaining -= num_sectors;
316 } else if (rw & REQ_WRITE_SAME) { 318 } else if (rw & REQ_WRITE_SAME) {
317 /* 319 /*
@@ -320,7 +322,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
320 dp->get_page(dp, &page, &len, &offset); 322 dp->get_page(dp, &page, &len, &offset);
321 bio_add_page(bio, page, logical_block_size, offset); 323 bio_add_page(bio, page, logical_block_size, offset);
322 num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining); 324 num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining);
323 bio->bi_size = num_sectors << SECTOR_SHIFT; 325 bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
324 326
325 offset = 0; 327 offset = 0;
326 remaining -= num_sectors; 328 remaining -= num_sectors;
@@ -457,8 +459,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
457 list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); 459 list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
458 break; 460 break;
459 461
460 case DM_IO_BVEC: 462 case DM_IO_BIO:
461 bvec_dp_init(dp, io_req->mem.ptr.bvec); 463 bio_dp_init(dp, io_req->mem.ptr.bio);
462 break; 464 break;
463 465
464 case DM_IO_VMA: 466 case DM_IO_VMA:
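The new DM_IO_BIO memory type hands dm-io a bio positioned at its current iterator, including a partially consumed first bvec via bi_bvec_done. The dm-raid1 hunks later in this patch use it exactly this way; a condensed sketch (the wrapper function is hypothetical, the dm_io_request fields and dm_io() call are real):

static void issue_bio_read(struct dm_io_client *client,
			   struct dm_io_region *where,
			   struct bio *bio, io_notify_fn callback)
{
	struct dm_io_request io_req = {
		.bi_rw = READ,
		.mem.type = DM_IO_BIO,		/* walk the bio from bio->bi_iter onwards */
		.mem.ptr.bio = bio,
		.notify.fn = callback,
		.notify.context = bio,
		.client = client,
	};

	BUG_ON(dm_io(&io_req, 1, where, NULL));
}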
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 4f99d267340c..53e848c10939 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -85,7 +85,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
85 85
86 bio->bi_bdev = lc->dev->bdev; 86 bio->bi_bdev = lc->dev->bdev;
87 if (bio_sectors(bio)) 87 if (bio_sectors(bio))
88 bio->bi_sector = linear_map_sector(ti, bio->bi_sector); 88 bio->bi_iter.bi_sector =
89 linear_map_sector(ti, bio->bi_iter.bi_sector);
89} 90}
90 91
91static int linear_map(struct dm_target *ti, struct bio *bio) 92static int linear_map(struct dm_target *ti, struct bio *bio)
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 9429159d9ee3..b953db6cc229 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -10,10 +10,11 @@
10#include <linux/device-mapper.h> 10#include <linux/device-mapper.h>
11#include <linux/dm-log-userspace.h> 11#include <linux/dm-log-userspace.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/workqueue.h>
13 14
14#include "dm-log-userspace-transfer.h" 15#include "dm-log-userspace-transfer.h"
15 16
16#define DM_LOG_USERSPACE_VSN "1.1.0" 17#define DM_LOG_USERSPACE_VSN "1.3.0"
17 18
18struct flush_entry { 19struct flush_entry {
19 int type; 20 int type;
@@ -58,6 +59,18 @@ struct log_c {
58 spinlock_t flush_lock; 59 spinlock_t flush_lock;
59 struct list_head mark_list; 60 struct list_head mark_list;
60 struct list_head clear_list; 61 struct list_head clear_list;
62
63 /*
64 * Workqueue for flush of clear region requests.
65 */
66 struct workqueue_struct *dmlog_wq;
67 struct delayed_work flush_log_work;
68 atomic_t sched_flush;
69
70 /*
71 * Combine userspace flush and mark requests for efficiency.
72 */
73 uint32_t integrated_flush;
61}; 74};
62 75
63static mempool_t *flush_entry_pool; 76static mempool_t *flush_entry_pool;
@@ -122,6 +135,9 @@ static int build_constructor_string(struct dm_target *ti,
122 135
123 *ctr_str = NULL; 136 *ctr_str = NULL;
124 137
138 /*
139 * Determine overall size of the string.
140 */
125 for (i = 0, str_size = 0; i < argc; i++) 141 for (i = 0, str_size = 0; i < argc; i++)
126 str_size += strlen(argv[i]) + 1; /* +1 for space between args */ 142 str_size += strlen(argv[i]) + 1; /* +1 for space between args */
127 143
@@ -141,18 +157,39 @@ static int build_constructor_string(struct dm_target *ti,
141 return str_size; 157 return str_size;
142} 158}
143 159
160static void do_flush(struct work_struct *work)
161{
162 int r;
163 struct log_c *lc = container_of(work, struct log_c, flush_log_work.work);
164
165 atomic_set(&lc->sched_flush, 0);
166
167 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL);
168
169 if (r)
170 dm_table_event(lc->ti->table);
171}
172
144/* 173/*
145 * userspace_ctr 174 * userspace_ctr
146 * 175 *
147 * argv contains: 176 * argv contains:
148 * <UUID> <other args> 177 * <UUID> [integrated_flush] <other args>
149 * Where 'other args' is the userspace implementation specific log 178 * Where 'other args' are the userspace implementation-specific log
150 * arguments. An example might be: 179 * arguments.
151 * <UUID> clustered-disk <arg count> <log dev> <region_size> [[no]sync] 180 *
181 * Example:
182 * <UUID> [integrated_flush] clustered-disk <arg count> <log dev>
183 * <region_size> [[no]sync]
184 *
185 * This module strips off the <UUID> and uses it for identification
186 * purposes when communicating with userspace about a log.
152 * 187 *
153 * So, this module will strip off the <UUID> for identification purposes 188 * If integrated_flush is defined, the kernel combines flush
154 * when communicating with userspace about a log; but will pass on everything 189 * and mark requests.
155 * else. 190 *
191 * The rest of the line, beginning with 'clustered-disk', is passed
192 * to the userspace ctr function.
156 */ 193 */
157static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, 194static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
158 unsigned argc, char **argv) 195 unsigned argc, char **argv)
@@ -188,12 +225,22 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
188 return -EINVAL; 225 return -EINVAL;
189 } 226 }
190 227
228 lc->usr_argc = argc;
229
191 strncpy(lc->uuid, argv[0], DM_UUID_LEN); 230 strncpy(lc->uuid, argv[0], DM_UUID_LEN);
231 argc--;
232 argv++;
192 spin_lock_init(&lc->flush_lock); 233 spin_lock_init(&lc->flush_lock);
193 INIT_LIST_HEAD(&lc->mark_list); 234 INIT_LIST_HEAD(&lc->mark_list);
194 INIT_LIST_HEAD(&lc->clear_list); 235 INIT_LIST_HEAD(&lc->clear_list);
195 236
196 str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str); 237 if (!strcasecmp(argv[0], "integrated_flush")) {
238 lc->integrated_flush = 1;
239 argc--;
240 argv++;
241 }
242
243 str_size = build_constructor_string(ti, argc, argv, &ctr_str);
197 if (str_size < 0) { 244 if (str_size < 0) {
198 kfree(lc); 245 kfree(lc);
199 return str_size; 246 return str_size;
@@ -246,6 +293,19 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
246 DMERR("Failed to register %s with device-mapper", 293 DMERR("Failed to register %s with device-mapper",
247 devices_rdata); 294 devices_rdata);
248 } 295 }
296
297 if (lc->integrated_flush) {
298 lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0);
299 if (!lc->dmlog_wq) {
300 DMERR("couldn't start dmlogd");
301 r = -ENOMEM;
302 goto out;
303 }
304
305 INIT_DELAYED_WORK(&lc->flush_log_work, do_flush);
306 atomic_set(&lc->sched_flush, 0);
307 }
308
249out: 309out:
250 kfree(devices_rdata); 310 kfree(devices_rdata);
251 if (r) { 311 if (r) {
@@ -253,7 +313,6 @@ out:
253 kfree(ctr_str); 313 kfree(ctr_str);
254 } else { 314 } else {
255 lc->usr_argv_str = ctr_str; 315 lc->usr_argv_str = ctr_str;
256 lc->usr_argc = argc;
257 log->context = lc; 316 log->context = lc;
258 } 317 }
259 318
@@ -264,9 +323,16 @@ static void userspace_dtr(struct dm_dirty_log *log)
264{ 323{
265 struct log_c *lc = log->context; 324 struct log_c *lc = log->context;
266 325
326 if (lc->integrated_flush) {
327 /* flush workqueue */
328 if (atomic_read(&lc->sched_flush))
329 flush_delayed_work(&lc->flush_log_work);
330
331 destroy_workqueue(lc->dmlog_wq);
332 }
333
267 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, 334 (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
268 NULL, 0, 335 NULL, 0, NULL, NULL);
269 NULL, NULL);
270 336
271 if (lc->log_dev) 337 if (lc->log_dev)
272 dm_put_device(lc->ti, lc->log_dev); 338 dm_put_device(lc->ti, lc->log_dev);
@@ -283,8 +349,7 @@ static int userspace_presuspend(struct dm_dirty_log *log)
283 struct log_c *lc = log->context; 349 struct log_c *lc = log->context;
284 350
285 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, 351 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND,
286 NULL, 0, 352 NULL, 0, NULL, NULL);
287 NULL, NULL);
288 353
289 return r; 354 return r;
290} 355}
@@ -294,9 +359,14 @@ static int userspace_postsuspend(struct dm_dirty_log *log)
294 int r; 359 int r;
295 struct log_c *lc = log->context; 360 struct log_c *lc = log->context;
296 361
362 /*
363 * Run planned flush earlier.
364 */
365 if (lc->integrated_flush && atomic_read(&lc->sched_flush))
366 flush_delayed_work(&lc->flush_log_work);
367
297 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, 368 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND,
298 NULL, 0, 369 NULL, 0, NULL, NULL);
299 NULL, NULL);
300 370
301 return r; 371 return r;
302} 372}
@@ -308,8 +378,7 @@ static int userspace_resume(struct dm_dirty_log *log)
308 378
309 lc->in_sync_hint = 0; 379 lc->in_sync_hint = 0;
310 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, 380 r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME,
311 NULL, 0, 381 NULL, 0, NULL, NULL);
312 NULL, NULL);
313 382
314 return r; 383 return r;
315} 384}
@@ -405,7 +474,8 @@ static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
405 return r; 474 return r;
406} 475}
407 476
408static int flush_by_group(struct log_c *lc, struct list_head *flush_list) 477static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
478 int flush_with_payload)
409{ 479{
410 int r = 0; 480 int r = 0;
411 int count; 481 int count;
@@ -431,15 +501,29 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
431 break; 501 break;
432 } 502 }
433 503
434 r = userspace_do_request(lc, lc->uuid, type, 504 if (flush_with_payload) {
435 (char *)(group), 505 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
436 count * sizeof(uint64_t), 506 (char *)(group),
437 NULL, NULL); 507 count * sizeof(uint64_t),
438 if (r) { 508 NULL, NULL);
439 /* Group send failed. Attempt one-by-one. */ 509 /*
440 list_splice_init(&tmp_list, flush_list); 510 * Integrated flush failed.
441 r = flush_one_by_one(lc, flush_list); 511 */
442 break; 512 if (r)
513 break;
514 } else {
515 r = userspace_do_request(lc, lc->uuid, type,
516 (char *)(group),
517 count * sizeof(uint64_t),
518 NULL, NULL);
519 if (r) {
520 /*
521 * Group send failed. Attempt one-by-one.
522 */
523 list_splice_init(&tmp_list, flush_list);
524 r = flush_one_by_one(lc, flush_list);
525 break;
526 }
443 } 527 }
444 } 528 }
445 529
@@ -476,6 +560,8 @@ static int userspace_flush(struct dm_dirty_log *log)
476 struct log_c *lc = log->context; 560 struct log_c *lc = log->context;
477 LIST_HEAD(mark_list); 561 LIST_HEAD(mark_list);
478 LIST_HEAD(clear_list); 562 LIST_HEAD(clear_list);
563 int mark_list_is_empty;
564 int clear_list_is_empty;
479 struct flush_entry *fe, *tmp_fe; 565 struct flush_entry *fe, *tmp_fe;
480 566
481 spin_lock_irqsave(&lc->flush_lock, flags); 567 spin_lock_irqsave(&lc->flush_lock, flags);
@@ -483,23 +569,51 @@ static int userspace_flush(struct dm_dirty_log *log)
483 list_splice_init(&lc->clear_list, &clear_list); 569 list_splice_init(&lc->clear_list, &clear_list);
484 spin_unlock_irqrestore(&lc->flush_lock, flags); 570 spin_unlock_irqrestore(&lc->flush_lock, flags);
485 571
486 if (list_empty(&mark_list) && list_empty(&clear_list)) 572 mark_list_is_empty = list_empty(&mark_list);
573 clear_list_is_empty = list_empty(&clear_list);
574
575 if (mark_list_is_empty && clear_list_is_empty)
487 return 0; 576 return 0;
488 577
489 r = flush_by_group(lc, &mark_list); 578 r = flush_by_group(lc, &clear_list, 0);
490 if (r) 579 if (r)
491 goto fail; 580 goto out;
492 581
493 r = flush_by_group(lc, &clear_list); 582 if (!lc->integrated_flush) {
583 r = flush_by_group(lc, &mark_list, 0);
584 if (r)
585 goto out;
586 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
587 NULL, 0, NULL, NULL);
588 goto out;
589 }
590
591 /*
592 * Send integrated flush request with mark_list as payload.
593 */
594 r = flush_by_group(lc, &mark_list, 1);
494 if (r) 595 if (r)
495 goto fail; 596 goto out;
496 597
497 r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, 598 if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) {
498 NULL, 0, NULL, NULL); 599 /*
600 * When there are only clear region requests,
601 * we schedule a flush in the future.
602 */
603 queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ);
604 atomic_set(&lc->sched_flush, 1);
605 } else {
606 /*
607 * Cancel pending flush because we
608 * have already flushed in mark_region.
609 */
610 cancel_delayed_work(&lc->flush_log_work);
611 atomic_set(&lc->sched_flush, 0);
612 }
499 613
500fail: 614out:
501 /* 615 /*
502 * We can safely remove these entries, even if failure. 616 * We can safely remove these entries, even after failure.
503 * Calling code will receive an error and will know that 617 * Calling code will receive an error and will know that
504 * the log facility has failed. 618 * the log facility has failed.
505 */ 619 */
@@ -603,8 +717,7 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
603 717
604 rdata_size = sizeof(pkg); 718 rdata_size = sizeof(pkg);
605 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, 719 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
606 NULL, 0, 720 NULL, 0, (char *)&pkg, &rdata_size);
607 (char *)&pkg, &rdata_size);
608 721
609 *region = pkg.r; 722 *region = pkg.r;
610 return (r) ? r : (int)pkg.i; 723 return (r) ? r : (int)pkg.i;
@@ -630,8 +743,7 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
630 pkg.i = (int64_t)in_sync; 743 pkg.i = (int64_t)in_sync;
631 744
632 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, 745 r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
633 (char *)&pkg, sizeof(pkg), 746 (char *)&pkg, sizeof(pkg), NULL, NULL);
634 NULL, NULL);
635 747
636 /* 748 /*
637 * It would be nice to be able to report failures. 749 * It would be nice to be able to report failures.
@@ -657,8 +769,7 @@ static region_t userspace_get_sync_count(struct dm_dirty_log *log)
657 769
658 rdata_size = sizeof(sync_count); 770 rdata_size = sizeof(sync_count);
659 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, 771 r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
660 NULL, 0, 772 NULL, 0, (char *)&sync_count, &rdata_size);
661 (char *)&sync_count, &rdata_size);
662 773
663 if (r) 774 if (r)
664 return 0; 775 return 0;
@@ -685,8 +796,7 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
685 switch (status_type) { 796 switch (status_type) {
686 case STATUSTYPE_INFO: 797 case STATUSTYPE_INFO:
687 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, 798 r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
688 NULL, 0, 799 NULL, 0, result, &sz);
689 result, &sz);
690 800
691 if (r) { 801 if (r) {
692 sz = 0; 802 sz = 0;
@@ -699,8 +809,10 @@ static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
699 BUG_ON(!table_args); /* There will always be a ' ' */ 809 BUG_ON(!table_args); /* There will always be a ' ' */
700 table_args++; 810 table_args++;
701 811
702 DMEMIT("%s %u %s %s ", log->type->name, lc->usr_argc, 812 DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid);
703 lc->uuid, table_args); 813 if (lc->integrated_flush)
814 DMEMIT("integrated_flush ");
815 DMEMIT("%s ", table_args);
704 break; 816 break;
705 } 817 }
706 return (r) ? 0 : (int)sz; 818 return (r) ? 0 : (int)sz;
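With integrated_flush, clear-only batches no longer trigger an immediate userspace round trip; the flush is parked on a delayed work item and cancelled again once a mark forces a real flush. The scheduling idiom on its own (kernel context assumed; only the workqueue and atomic calls are real, the example_log structure mirrors the fields added above):

struct example_log {
	struct workqueue_struct *wq;
	struct delayed_work flush_work;
	atomic_t sched_flush;
};

static void example_defer_flush(struct example_log *lc)
{
	if (!atomic_read(&lc->sched_flush)) {
		/* Only clear-region requests pending: flush a little later. */
		queue_delayed_work(lc->wq, &lc->flush_work, 3 * HZ);
		atomic_set(&lc->sched_flush, 1);
	}
}

static void example_flushed_now(struct example_log *lc)
{
	/* A real flush just went to userspace; drop the deferred one. */
	cancel_delayed_work(&lc->flush_work);
	atomic_set(&lc->sched_flush, 0);
}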
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6eb9dc9ef8f3..422a9fdeb53e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1626,8 +1626,11 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1626 /* 1626 /*
1627 * Only pass ioctls through if the device sizes match exactly. 1627 * Only pass ioctls through if the device sizes match exactly.
1628 */ 1628 */
1629 if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) 1629 if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
1630 r = scsi_verify_blk_ioctl(NULL, cmd); 1630 int err = scsi_verify_blk_ioctl(NULL, cmd);
1631 if (err)
1632 r = err;
1633 }
1631 1634
1632 if (r == -ENOTCONN && !fatal_signal_pending(current)) 1635 if (r == -ENOTCONN && !fatal_signal_pending(current))
1633 queue_work(kmultipathd, &m->process_queued_ios); 1636 queue_work(kmultipathd, &m->process_queued_ios);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9584443c5614..7dfdb5c746d6 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -432,7 +432,7 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
432 region_t region = dm_rh_bio_to_region(ms->rh, bio); 432 region_t region = dm_rh_bio_to_region(ms->rh, bio);
433 433
434 if (log->type->in_sync(log, region, 0)) 434 if (log->type->in_sync(log, region, 0))
435 return choose_mirror(ms, bio->bi_sector) ? 1 : 0; 435 return choose_mirror(ms, bio->bi_iter.bi_sector) ? 1 : 0;
436 436
437 return 0; 437 return 0;
438} 438}
@@ -442,15 +442,15 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
442 */ 442 */
443static sector_t map_sector(struct mirror *m, struct bio *bio) 443static sector_t map_sector(struct mirror *m, struct bio *bio)
444{ 444{
445 if (unlikely(!bio->bi_size)) 445 if (unlikely(!bio->bi_iter.bi_size))
446 return 0; 446 return 0;
447 return m->offset + dm_target_offset(m->ms->ti, bio->bi_sector); 447 return m->offset + dm_target_offset(m->ms->ti, bio->bi_iter.bi_sector);
448} 448}
449 449
450static void map_bio(struct mirror *m, struct bio *bio) 450static void map_bio(struct mirror *m, struct bio *bio)
451{ 451{
452 bio->bi_bdev = m->dev->bdev; 452 bio->bi_bdev = m->dev->bdev;
453 bio->bi_sector = map_sector(m, bio); 453 bio->bi_iter.bi_sector = map_sector(m, bio);
454} 454}
455 455
456static void map_region(struct dm_io_region *io, struct mirror *m, 456static void map_region(struct dm_io_region *io, struct mirror *m,
@@ -526,8 +526,8 @@ static void read_async_bio(struct mirror *m, struct bio *bio)
526 struct dm_io_region io; 526 struct dm_io_region io;
527 struct dm_io_request io_req = { 527 struct dm_io_request io_req = {
528 .bi_rw = READ, 528 .bi_rw = READ,
529 .mem.type = DM_IO_BVEC, 529 .mem.type = DM_IO_BIO,
530 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 530 .mem.ptr.bio = bio,
531 .notify.fn = read_callback, 531 .notify.fn = read_callback,
532 .notify.context = bio, 532 .notify.context = bio,
533 .client = m->ms->io_client, 533 .client = m->ms->io_client,
@@ -559,7 +559,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
559 * We can only read balance if the region is in sync. 559 * We can only read balance if the region is in sync.
560 */ 560 */
561 if (likely(region_in_sync(ms, region, 1))) 561 if (likely(region_in_sync(ms, region, 1)))
562 m = choose_mirror(ms, bio->bi_sector); 562 m = choose_mirror(ms, bio->bi_iter.bi_sector);
563 else if (m && atomic_read(&m->error_count)) 563 else if (m && atomic_read(&m->error_count))
564 m = NULL; 564 m = NULL;
565 565
@@ -629,8 +629,8 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
629 struct mirror *m; 629 struct mirror *m;
630 struct dm_io_request io_req = { 630 struct dm_io_request io_req = {
631 .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), 631 .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
632 .mem.type = DM_IO_BVEC, 632 .mem.type = DM_IO_BIO,
633 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 633 .mem.ptr.bio = bio,
634 .notify.fn = write_callback, 634 .notify.fn = write_callback,
635 .notify.context = bio, 635 .notify.context = bio,
636 .client = ms->io_client, 636 .client = ms->io_client,
@@ -1181,7 +1181,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
1181 * The region is in-sync and we can perform reads directly. 1181 * The region is in-sync and we can perform reads directly.
1182 * Store enough information so we can retry if it fails. 1182 * Store enough information so we can retry if it fails.
1183 */ 1183 */
1184 m = choose_mirror(ms, bio->bi_sector); 1184 m = choose_mirror(ms, bio->bi_iter.bi_sector);
1185 if (unlikely(!m)) 1185 if (unlikely(!m))
1186 return -EIO; 1186 return -EIO;
1187 1187
@@ -1244,6 +1244,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
1244 1244
1245 dm_bio_restore(bd, bio); 1245 dm_bio_restore(bd, bio);
1246 bio_record->details.bi_bdev = NULL; 1246 bio_record->details.bi_bdev = NULL;
1247
1248 atomic_inc(&bio->bi_remaining);
1249
1247 queue_bio(ms, bio, rw); 1250 queue_bio(ms, bio, rw);
1248 return DM_ENDIO_INCOMPLETE; 1251 return DM_ENDIO_INCOMPLETE;
1249 } 1252 }
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 69732e03eb34..b929fd5f4984 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -126,7 +126,8 @@ EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
126 126
127region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) 127region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
128{ 128{
129 return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); 129 return dm_rh_sector_to_region(rh, bio->bi_iter.bi_sector -
130 rh->target_begin);
130} 131}
131EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); 132EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
132 133
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2d2b1b7588d7..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -13,10 +13,13 @@
13#include <linux/export.h> 13#include <linux/export.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/dm-io.h> 15#include <linux/dm-io.h>
16#include "dm-bufio.h"
16 17
17#define DM_MSG_PREFIX "persistent snapshot" 18#define DM_MSG_PREFIX "persistent snapshot"
18#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ 19#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
19 20
21#define DM_PREFETCH_CHUNKS 12
22
20/*----------------------------------------------------------------- 23/*-----------------------------------------------------------------
21 * Persistent snapshots, by persistent we mean that the snapshot 24 * Persistent snapshots, by persistent we mean that the snapshot
22 * will survive a reboot. 25 * will survive a reboot.
@@ -257,6 +260,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
257 INIT_WORK_ONSTACK(&req.work, do_metadata); 260 INIT_WORK_ONSTACK(&req.work, do_metadata);
258 queue_work(ps->metadata_wq, &req.work); 261 queue_work(ps->metadata_wq, &req.work);
259 flush_workqueue(ps->metadata_wq); 262 flush_workqueue(ps->metadata_wq);
263 destroy_work_on_stack(&req.work);
260 264
261 return req.result; 265 return req.result;
262} 266}
@@ -401,17 +405,18 @@ static int write_header(struct pstore *ps)
401/* 405/*
402 * Access functions for the disk exceptions, these do the endian conversions. 406 * Access functions for the disk exceptions, these do the endian conversions.
403 */ 407 */
404static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) 408static struct disk_exception *get_exception(struct pstore *ps, void *ps_area,
409 uint32_t index)
405{ 410{
406 BUG_ON(index >= ps->exceptions_per_area); 411 BUG_ON(index >= ps->exceptions_per_area);
407 412
408 return ((struct disk_exception *) ps->area) + index; 413 return ((struct disk_exception *) ps_area) + index;
409} 414}
410 415
411static void read_exception(struct pstore *ps, 416static void read_exception(struct pstore *ps, void *ps_area,
412 uint32_t index, struct core_exception *result) 417 uint32_t index, struct core_exception *result)
413{ 418{
414 struct disk_exception *de = get_exception(ps, index); 419 struct disk_exception *de = get_exception(ps, ps_area, index);
415 420
416 /* copy it */ 421 /* copy it */
417 result->old_chunk = le64_to_cpu(de->old_chunk); 422 result->old_chunk = le64_to_cpu(de->old_chunk);
@@ -421,7 +426,7 @@ static void read_exception(struct pstore *ps,
421static void write_exception(struct pstore *ps, 426static void write_exception(struct pstore *ps,
422 uint32_t index, struct core_exception *e) 427 uint32_t index, struct core_exception *e)
423{ 428{
424 struct disk_exception *de = get_exception(ps, index); 429 struct disk_exception *de = get_exception(ps, ps->area, index);
425 430
426 /* copy it */ 431 /* copy it */
427 de->old_chunk = cpu_to_le64(e->old_chunk); 432 de->old_chunk = cpu_to_le64(e->old_chunk);
@@ -430,7 +435,7 @@ static void write_exception(struct pstore *ps,
430 435
431static void clear_exception(struct pstore *ps, uint32_t index) 436static void clear_exception(struct pstore *ps, uint32_t index)
432{ 437{
433 struct disk_exception *de = get_exception(ps, index); 438 struct disk_exception *de = get_exception(ps, ps->area, index);
434 439
435 /* clear it */ 440 /* clear it */
436 de->old_chunk = 0; 441 de->old_chunk = 0;
@@ -442,7 +447,7 @@ static void clear_exception(struct pstore *ps, uint32_t index)
442 * 'full' is filled in to indicate if the area has been 447 * 'full' is filled in to indicate if the area has been
443 * filled. 448 * filled.
444 */ 449 */
445static int insert_exceptions(struct pstore *ps, 450static int insert_exceptions(struct pstore *ps, void *ps_area,
446 int (*callback)(void *callback_context, 451 int (*callback)(void *callback_context,
447 chunk_t old, chunk_t new), 452 chunk_t old, chunk_t new),
448 void *callback_context, 453 void *callback_context,
@@ -456,7 +461,7 @@ static int insert_exceptions(struct pstore *ps,
456 *full = 1; 461 *full = 1;
457 462
458 for (i = 0; i < ps->exceptions_per_area; i++) { 463 for (i = 0; i < ps->exceptions_per_area; i++) {
459 read_exception(ps, i, &e); 464 read_exception(ps, ps_area, i, &e);
460 465
461 /* 466 /*
462 * If the new_chunk is pointing at the start of 467 * If the new_chunk is pointing at the start of
@@ -493,26 +498,75 @@ static int read_exceptions(struct pstore *ps,
493 void *callback_context) 498 void *callback_context)
494{ 499{
495 int r, full = 1; 500 int r, full = 1;
501 struct dm_bufio_client *client;
502 chunk_t prefetch_area = 0;
503
504 client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev,
505 ps->store->chunk_size << SECTOR_SHIFT,
506 1, 0, NULL, NULL);
507
508 if (IS_ERR(client))
509 return PTR_ERR(client);
510
511 /*
512 * Setup for one current buffer + desired readahead buffers.
513 */
514 dm_bufio_set_minimum_buffers(client, 1 + DM_PREFETCH_CHUNKS);
496 515
497 /* 516 /*
498 * Keeping reading chunks and inserting exceptions until 517 * Keeping reading chunks and inserting exceptions until
499 * we find a partially full area. 518 * we find a partially full area.
500 */ 519 */
501 for (ps->current_area = 0; full; ps->current_area++) { 520 for (ps->current_area = 0; full; ps->current_area++) {
502 r = area_io(ps, READ); 521 struct dm_buffer *bp;
503 if (r) 522 void *area;
504 return r; 523 chunk_t chunk;
524
525 if (unlikely(prefetch_area < ps->current_area))
526 prefetch_area = ps->current_area;
527
528 if (DM_PREFETCH_CHUNKS) do {
529 chunk_t pf_chunk = area_location(ps, prefetch_area);
530 if (unlikely(pf_chunk >= dm_bufio_get_device_size(client)))
531 break;
532 dm_bufio_prefetch(client, pf_chunk, 1);
533 prefetch_area++;
534 if (unlikely(!prefetch_area))
535 break;
536 } while (prefetch_area <= ps->current_area + DM_PREFETCH_CHUNKS);
537
538 chunk = area_location(ps, ps->current_area);
539
540 area = dm_bufio_read(client, chunk, &bp);
541 if (unlikely(IS_ERR(area))) {
542 r = PTR_ERR(area);
543 goto ret_destroy_bufio;
544 }
505 545
506 r = insert_exceptions(ps, callback, callback_context, &full); 546 r = insert_exceptions(ps, area, callback, callback_context,
507 if (r) 547 &full);
508 return r; 548
549 if (!full)
550 memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
551
552 dm_bufio_release(bp);
553
554 dm_bufio_forget(client, chunk);
555
556 if (unlikely(r))
557 goto ret_destroy_bufio;
509 } 558 }
510 559
511 ps->current_area--; 560 ps->current_area--;
512 561
513 skip_metadata(ps); 562 skip_metadata(ps);
514 563
515 return 0; 564 r = 0;
565
566ret_destroy_bufio:
567 dm_bufio_client_destroy(client);
568
569 return r;
516} 570}
517 571
518static struct pstore *get_info(struct dm_exception_store *store) 572static struct pstore *get_info(struct dm_exception_store *store)
@@ -733,7 +787,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
733 ps->current_committed = ps->exceptions_per_area; 787 ps->current_committed = ps->exceptions_per_area;
734 } 788 }
735 789
736 read_exception(ps, ps->current_committed - 1, &ce); 790 read_exception(ps, ps->area, ps->current_committed - 1, &ce);
737 *last_old_chunk = ce.old_chunk; 791 *last_old_chunk = ce.old_chunk;
738 *last_new_chunk = ce.new_chunk; 792 *last_new_chunk = ce.new_chunk;
739 793
@@ -743,8 +797,8 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
743 */ 797 */
744 for (nr_consecutive = 1; nr_consecutive < ps->current_committed; 798 for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
745 nr_consecutive++) { 799 nr_consecutive++) {
746 read_exception(ps, ps->current_committed - 1 - nr_consecutive, 800 read_exception(ps, ps->area,
747 &ce); 801 ps->current_committed - 1 - nr_consecutive, &ce);
748 if (ce.old_chunk != *last_old_chunk - nr_consecutive || 802 if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
749 ce.new_chunk != *last_new_chunk - nr_consecutive) 803 ce.new_chunk != *last_new_chunk - nr_consecutive)
750 break; 804 break;
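read_exceptions() above now reads the on-disk exception areas through dm-bufio so that, while one area is being parsed, the next DM_PREFETCH_CHUNKS areas are already in flight. A condensed sketch of that read-with-readahead pattern, using only the dm-bufio calls that appear in the hunk; the helper name and the trimmed error handling are editorial, not the patch's:

    static void *read_area_with_readahead(struct dm_bufio_client *client,
                                          sector_t chunk, unsigned nr_prefetch,
                                          struct dm_buffer **bp)
    {
            sector_t pf;

            /* Kick off asynchronous reads for the areas we expect to need next. */
            for (pf = chunk + 1; pf <= chunk + nr_prefetch; pf++) {
                    if (pf >= dm_bufio_get_device_size(client))
                            break;
                    dm_bufio_prefetch(client, pf, 1);
            }

            /* Synchronous read of the area needed right now. */
            return dm_bufio_read(client, chunk, bp);
    }

The real loop additionally keeps prefetch_area monotonic across iterations, releases and forgets each buffer once parsed, and copies the final, partially filled area back into ps->area so later writes keep operating on the pstore's own buffer.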
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 944690bafd93..ebddef5237e4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -610,12 +610,12 @@ static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
610 return NULL; 610 return NULL;
611} 611}
612 612
613static struct dm_exception *alloc_completed_exception(void) 613static struct dm_exception *alloc_completed_exception(gfp_t gfp)
614{ 614{
615 struct dm_exception *e; 615 struct dm_exception *e;
616 616
617 e = kmem_cache_alloc(exception_cache, GFP_NOIO); 617 e = kmem_cache_alloc(exception_cache, gfp);
618 if (!e) 618 if (!e && gfp == GFP_NOIO)
619 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); 619 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
620 620
621 return e; 621 return e;
@@ -697,7 +697,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
697 struct dm_snapshot *s = context; 697 struct dm_snapshot *s = context;
698 struct dm_exception *e; 698 struct dm_exception *e;
699 699
700 e = alloc_completed_exception(); 700 e = alloc_completed_exception(GFP_KERNEL);
701 if (!e) 701 if (!e)
702 return -ENOMEM; 702 return -ENOMEM;
703 703
@@ -1405,7 +1405,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
1405 goto out; 1405 goto out;
1406 } 1406 }
1407 1407
1408 e = alloc_completed_exception(); 1408 e = alloc_completed_exception(GFP_NOIO);
1409 if (!e) { 1409 if (!e) {
1410 down_write(&s->lock); 1410 down_write(&s->lock);
1411 __invalidate_snapshot(s, -ENOMEM); 1411 __invalidate_snapshot(s, -ENOMEM);
@@ -1438,6 +1438,7 @@ out:
1438 if (full_bio) { 1438 if (full_bio) {
1439 full_bio->bi_end_io = pe->full_bio_end_io; 1439 full_bio->bi_end_io = pe->full_bio_end_io;
1440 full_bio->bi_private = pe->full_bio_private; 1440 full_bio->bi_private = pe->full_bio_private;
1441 atomic_inc(&full_bio->bi_remaining);
1441 } 1442 }
1442 free_pending_exception(pe); 1443 free_pending_exception(pe);
1443 1444
@@ -1619,11 +1620,10 @@ static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1619 struct bio *bio, chunk_t chunk) 1620 struct bio *bio, chunk_t chunk)
1620{ 1621{
1621 bio->bi_bdev = s->cow->bdev; 1622 bio->bi_bdev = s->cow->bdev;
1622 bio->bi_sector = chunk_to_sector(s->store, 1623 bio->bi_iter.bi_sector =
1623 dm_chunk_number(e->new_chunk) + 1624 chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
1624 (chunk - e->old_chunk)) + 1625 (chunk - e->old_chunk)) +
1625 (bio->bi_sector & 1626 (bio->bi_iter.bi_sector & s->store->chunk_mask);
1626 s->store->chunk_mask);
1627} 1627}
1628 1628
1629static int snapshot_map(struct dm_target *ti, struct bio *bio) 1629static int snapshot_map(struct dm_target *ti, struct bio *bio)
@@ -1641,7 +1641,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1641 return DM_MAPIO_REMAPPED; 1641 return DM_MAPIO_REMAPPED;
1642 } 1642 }
1643 1643
1644 chunk = sector_to_chunk(s->store, bio->bi_sector); 1644 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
1645 1645
1646 /* Full snapshots are not usable */ 1646 /* Full snapshots are not usable */
1647 /* To get here the table must be live so s->active is always set. */ 1647 /* To get here the table must be live so s->active is always set. */
@@ -1702,7 +1702,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
1702 r = DM_MAPIO_SUBMITTED; 1702 r = DM_MAPIO_SUBMITTED;
1703 1703
1704 if (!pe->started && 1704 if (!pe->started &&
1705 bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { 1705 bio->bi_iter.bi_size ==
1706 (s->store->chunk_size << SECTOR_SHIFT)) {
1706 pe->started = 1; 1707 pe->started = 1;
1707 up_write(&s->lock); 1708 up_write(&s->lock);
1708 start_full_bio(pe, bio); 1709 start_full_bio(pe, bio);
@@ -1758,7 +1759,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
1758 return DM_MAPIO_REMAPPED; 1759 return DM_MAPIO_REMAPPED;
1759 } 1760 }
1760 1761
1761 chunk = sector_to_chunk(s->store, bio->bi_sector); 1762 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
1762 1763
1763 down_write(&s->lock); 1764 down_write(&s->lock);
1764 1765
@@ -2095,7 +2096,7 @@ static int do_origin(struct dm_dev *origin, struct bio *bio)
2095 down_read(&_origins_lock); 2096 down_read(&_origins_lock);
2096 o = __lookup_origin(origin->bdev); 2097 o = __lookup_origin(origin->bdev);
2097 if (o) 2098 if (o)
2098 r = __origin_write(&o->snapshots, bio->bi_sector, bio); 2099 r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
2099 up_read(&_origins_lock); 2100 up_read(&_origins_lock);
2100 2101
2101 return r; 2102 return r;
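Two behavioural notes on the dm-snap.c hunks above. alloc_completed_exception() now takes its gfp flags from the caller, so the table-load path (dm_add_exception) can use plain GFP_KERNEL while the I/O completion path keeps GFP_NOIO with the old GFP_ATOMIC fallback. And pending_complete() takes an extra bi_remaining reference when handing full_bio back, which the reworked bio completion accounting in this series expects once a bio's end_io has been intercepted. The allocation helper, restated with comments:

    static struct dm_exception *alloc_completed_exception(gfp_t gfp)
    {
            struct dm_exception *e;

            e = kmem_cache_alloc(exception_cache, gfp);
            if (!e && gfp == GFP_NOIO)
                    /* I/O path: try the atomic reserves before giving up,
                     * exactly as the old unconditional code did. */
                    e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);

            return e;
    }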
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 73c1712dad96..d1600d2aa2e2 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -259,13 +259,15 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
259{ 259{
260 sector_t begin, end; 260 sector_t begin, end;
261 261
262 stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); 262 stripe_map_range_sector(sc, bio->bi_iter.bi_sector,
263 target_stripe, &begin);
263 stripe_map_range_sector(sc, bio_end_sector(bio), 264 stripe_map_range_sector(sc, bio_end_sector(bio),
264 target_stripe, &end); 265 target_stripe, &end);
265 if (begin < end) { 266 if (begin < end) {
266 bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; 267 bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
267 bio->bi_sector = begin + sc->stripe[target_stripe].physical_start; 268 bio->bi_iter.bi_sector = begin +
268 bio->bi_size = to_bytes(end - begin); 269 sc->stripe[target_stripe].physical_start;
270 bio->bi_iter.bi_size = to_bytes(end - begin);
269 return DM_MAPIO_REMAPPED; 271 return DM_MAPIO_REMAPPED;
270 } else { 272 } else {
271 /* The range doesn't map to the target stripe */ 273 /* The range doesn't map to the target stripe */
@@ -293,9 +295,10 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
293 return stripe_map_range(sc, bio, target_bio_nr); 295 return stripe_map_range(sc, bio, target_bio_nr);
294 } 296 }
295 297
296 stripe_map_sector(sc, bio->bi_sector, &stripe, &bio->bi_sector); 298 stripe_map_sector(sc, bio->bi_iter.bi_sector,
299 &stripe, &bio->bi_iter.bi_sector);
297 300
298 bio->bi_sector += sc->stripe[stripe].physical_start; 301 bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start;
299 bio->bi_bdev = sc->stripe[stripe].dev->bdev; 302 bio->bi_bdev = sc->stripe[stripe].dev->bdev;
300 303
301 return DM_MAPIO_REMAPPED; 304 return DM_MAPIO_REMAPPED;
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index ff9ac4be4721..09a688b3d48c 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -311,11 +311,11 @@ error:
311static int switch_map(struct dm_target *ti, struct bio *bio) 311static int switch_map(struct dm_target *ti, struct bio *bio)
312{ 312{
313 struct switch_ctx *sctx = ti->private; 313 struct switch_ctx *sctx = ti->private;
314 sector_t offset = dm_target_offset(ti, bio->bi_sector); 314 sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
315 unsigned path_nr = switch_get_path_nr(sctx, offset); 315 unsigned path_nr = switch_get_path_nr(sctx, offset);
316 316
317 bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev; 317 bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
318 bio->bi_sector = sctx->path_list[path_nr].start + offset; 318 bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
319 319
320 return DM_MAPIO_REMAPPED; 320 return DM_MAPIO_REMAPPED;
321} 321}
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 84d2b91e4efb..c62c5ab6aed5 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -86,6 +86,7 @@ static const struct sysfs_ops dm_sysfs_ops = {
86static struct kobj_type dm_ktype = { 86static struct kobj_type dm_ktype = {
87 .sysfs_ops = &dm_sysfs_ops, 87 .sysfs_ops = &dm_sysfs_ops,
88 .default_attrs = dm_attrs, 88 .default_attrs = dm_attrs,
89 .release = dm_kobject_release,
89}; 90};
90 91
91/* 92/*
@@ -104,5 +105,7 @@ int dm_sysfs_init(struct mapped_device *md)
104 */ 105 */
105void dm_sysfs_exit(struct mapped_device *md) 106void dm_sysfs_exit(struct mapped_device *md)
106{ 107{
107 kobject_put(dm_kobject(md)); 108 struct kobject *kobj = dm_kobject(md);
109 kobject_put(kobj);
110 wait_for_completion(dm_get_completion_from_kobject(kobj));
108} 111}
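Giving dm_ktype a release method and making dm_sysfs_exit() wait on a completion closes a teardown race: the mapped_device must not be freed while sysfs still holds a reference to its kobject. A sketch of the handshake, assuming dm_kobject_release() (added elsewhere in this series, outside this excerpt) simply completes the completion returned by dm_get_completion_from_kobject():

    /* Release side: called by the kobject core once the last reference drops. */
    void dm_kobject_release(struct kobject *kobj)
    {
            complete(dm_get_completion_from_kobject(kobj));
    }

    /* Teardown side, as in the hunk above: drop our reference, then block
     * until every remaining sysfs user has gone away. */
    void dm_sysfs_exit(struct mapped_device *md)
    {
            struct kobject *kobj = dm_kobject(md);

            kobject_put(kobj);
            wait_for_completion(dm_get_completion_from_kobject(kobj));
    }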
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3ba6a3859ce3..6a7f2b83a126 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -155,7 +155,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
155{ 155{
156 sector_t *n_highs; 156 sector_t *n_highs;
157 struct dm_target *n_targets; 157 struct dm_target *n_targets;
158 int n = t->num_targets;
159 158
160 /* 159 /*
161 * Allocate both the target array and offset array at once. 160 * Allocate both the target array and offset array at once.
@@ -169,12 +168,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
169 168
170 n_targets = (struct dm_target *) (n_highs + num); 169 n_targets = (struct dm_target *) (n_highs + num);
171 170
172 if (n) { 171 memset(n_highs, -1, sizeof(*n_highs) * num);
173 memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
174 memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
175 }
176
177 memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
178 vfree(t->highs); 172 vfree(t->highs);
179 173
180 t->num_allocated = num; 174 t->num_allocated = num;
@@ -261,17 +255,6 @@ void dm_table_destroy(struct dm_table *t)
261} 255}
262 256
263/* 257/*
264 * Checks to see if we need to extend highs or targets.
265 */
266static inline int check_space(struct dm_table *t)
267{
268 if (t->num_targets >= t->num_allocated)
269 return alloc_targets(t, t->num_allocated * 2);
270
271 return 0;
272}
273
274/*
275 * See if we've already got a device in the list. 258 * See if we've already got a device in the list.
276 */ 259 */
277static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) 260static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
@@ -731,8 +714,7 @@ int dm_table_add_target(struct dm_table *t, const char *type,
731 return -EINVAL; 714 return -EINVAL;
732 } 715 }
733 716
734 if ((r = check_space(t))) 717 BUG_ON(t->num_targets >= t->num_allocated);
735 return r;
736 718
737 tgt = t->targets + t->num_targets; 719 tgt = t->targets + t->num_targets;
738 memset(tgt, 0, sizeof(*tgt)); 720 memset(tgt, 0, sizeof(*tgt));
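The dm-table.c hunks remove the grow-on-demand path for the target array: a table is created with its final target count, so alloc_targets() never has existing entries to copy and check_space() can go. A caller only ever fills a pre-sized slot, which the new BUG_ON asserts. A reminder of where that size comes from, with the dm_table_create() declaration quoted from the device-mapper header (not part of this hunk):

    /* Capacity is fixed at creation time ... */
    int dm_table_create(struct dm_table **result, fmode_t mode,
                        unsigned num_targets, struct mapped_device *md);

    /* ... so dm_table_add_target() can only ever fill an existing slot: */
    BUG_ON(t->num_targets >= t->num_allocated);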
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 8a30ad54bd46..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
76 76
77#define THIN_SUPERBLOCK_MAGIC 27022010 77#define THIN_SUPERBLOCK_MAGIC 27022010
78#define THIN_SUPERBLOCK_LOCATION 0 78#define THIN_SUPERBLOCK_LOCATION 0
79#define THIN_VERSION 1 79#define THIN_VERSION 2
80#define THIN_METADATA_CACHE_SIZE 64 80#define THIN_METADATA_CACHE_SIZE 64
81#define SECTOR_TO_BLOCK_SHIFT 3 81#define SECTOR_TO_BLOCK_SHIFT 3
82 82
@@ -483,7 +483,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
483 483
484 disk_super->data_mapping_root = cpu_to_le64(pmd->root); 484 disk_super->data_mapping_root = cpu_to_le64(pmd->root);
485 disk_super->device_details_root = cpu_to_le64(pmd->details_root); 485 disk_super->device_details_root = cpu_to_le64(pmd->details_root);
486 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 486 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
487 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); 487 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); 488 disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
489 489
@@ -651,7 +651,7 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
651{ 651{
652 int r; 652 int r;
653 653
654 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE, 654 pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
655 THIN_METADATA_CACHE_SIZE, 655 THIN_METADATA_CACHE_SIZE,
656 THIN_MAX_CONCURRENT_LOCKS); 656 THIN_MAX_CONCURRENT_LOCKS);
657 if (IS_ERR(pmd->bm)) { 657 if (IS_ERR(pmd->bm)) {
@@ -1349,6 +1349,12 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1349 return td->id; 1349 return td->id;
1350} 1350}
1351 1351
1352/*
1353 * Check whether @time (of block creation) is older than @td's last snapshot.
1354 * If so then the associated block is shared with the last snapshot device.
1355 * Any block on a device created *after* the device last got snapshotted is
1356 * necessarily not shared.
1357 */
1352static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) 1358static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1353{ 1359{
1354 return td->snapshotted_time > time; 1360 return td->snapshotted_time > time;
@@ -1458,6 +1464,20 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1458 return r; 1464 return r;
1459} 1465}
1460 1466
1467int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
1468{
1469 int r;
1470 uint32_t ref_count;
1471
1472 down_read(&pmd->root_lock);
1473 r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
1474 if (!r)
1475 *result = (ref_count != 0);
1476 up_read(&pmd->root_lock);
1477
1478 return r;
1479}
1480
1461bool dm_thin_changed_this_transaction(struct dm_thin_device *td) 1481bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1462{ 1482{
1463 int r; 1483 int r;
@@ -1469,6 +1489,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1469 return r; 1489 return r;
1470} 1490}
1471 1491
1492bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
1493{
1494 bool r = false;
1495 struct dm_thin_device *td, *tmp;
1496
1497 down_read(&pmd->root_lock);
1498 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
1499 if (td->changed) {
1500 r = td->changed;
1501 break;
1502 }
1503 }
1504 up_read(&pmd->root_lock);
1505
1506 return r;
1507}
1508
1472bool dm_thin_aborted_changes(struct dm_thin_device *td) 1509bool dm_thin_aborted_changes(struct dm_thin_device *td)
1473{ 1510{
1474 bool r; 1511 bool r;
@@ -1718,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
1718 1755
1719 return r; 1756 return r;
1720} 1757}
1758
1759int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
1760{
1761 int r;
1762 struct dm_block *sblock;
1763 struct thin_disk_superblock *disk_super;
1764
1765 down_write(&pmd->root_lock);
1766 pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
1767
1768 r = superblock_lock(pmd, &sblock);
1769 if (r) {
1770 DMERR("couldn't read superblock");
1771 goto out;
1772 }
1773
1774 disk_super = dm_block_data(sblock);
1775 disk_super->flags = cpu_to_le32(pmd->flags);
1776
1777 dm_bm_unlock(sblock);
1778out:
1779 up_write(&pmd->root_lock);
1780 return r;
1781}
1782
1783bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
1784{
1785 bool needs_check;
1786
1787 down_read(&pmd->root_lock);
1788 needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
1789 up_read(&pmd->root_lock);
1790
1791 return needs_check;
1792}
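Three additions in dm-thin-metadata.c feed the dm-thin.c rework further down: dm_pool_block_is_used() lets the discard path ask whether a data block still has references before passing a discard through to the data device, dm_pool_changed_this_transaction() lets the worker skip commits when nothing changed, and THIN_METADATA_NEEDS_CHECK_FLAG (with THIN_VERSION bumped to 2) records in the superblock that userspace must check and repair the metadata before the pool may return to write mode. Typical use of the reference-count query, taken from process_prepared_discard_passdown() below:

    bool used = false;

    /* If the lookup fails, err on the safe side and keep the data. */
    if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
            bio_endio(m->bio, 0);                           /* block still referenced: swallow the discard */
    else
            remap_and_issue(tc, m->bio, m->data_block);     /* safe to pass the discard down */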
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 7bcc0e1d6238..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -9,16 +9,14 @@
9 9
10#include "persistent-data/dm-block-manager.h" 10#include "persistent-data/dm-block-manager.h"
11#include "persistent-data/dm-space-map.h" 11#include "persistent-data/dm-space-map.h"
12#include "persistent-data/dm-space-map-metadata.h"
12 13
13#define THIN_METADATA_BLOCK_SIZE 4096 14#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
14 15
15/* 16/*
16 * The metadata device is currently limited in size. 17 * The metadata device is currently limited in size.
17 *
18 * We have one block of index, which can hold 255 index entries. Each
19 * index entry contains allocation info about 16k metadata blocks.
20 */ 18 */
21#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) 19#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
22 20
23/* 21/*
24 * A metadata device larger than 16GB triggers a warning. 22 * A metadata device larger than 16GB triggers a warning.
@@ -27,6 +25,11 @@
27 25
28/*----------------------------------------------------------------*/ 26/*----------------------------------------------------------------*/
29 27
28/*
29 * Thin metadata superblock flags.
30 */
31#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
32
30struct dm_pool_metadata; 33struct dm_pool_metadata;
31struct dm_thin_device; 34struct dm_thin_device;
32 35
@@ -131,7 +134,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td);
131 134
132struct dm_thin_lookup_result { 135struct dm_thin_lookup_result {
133 dm_block_t block; 136 dm_block_t block;
134 unsigned shared:1; 137 bool shared:1;
135}; 138};
136 139
137/* 140/*
@@ -161,6 +164,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
161 */ 164 */
162bool dm_thin_changed_this_transaction(struct dm_thin_device *td); 165bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
163 166
167bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd);
168
164bool dm_thin_aborted_changes(struct dm_thin_device *td); 169bool dm_thin_aborted_changes(struct dm_thin_device *td);
165 170
166int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, 171int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
@@ -181,6 +186,8 @@ int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result);
181 186
182int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); 187int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
183 188
189int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result);
190
184/* 191/*
185 * Returns -ENOSPC if the new size is too small and already allocated 192 * Returns -ENOSPC if the new size is too small and already allocated
186 * blocks would be lost. 193 * blocks would be lost.
@@ -200,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
200 dm_sm_threshold_fn fn, 207 dm_sm_threshold_fn fn,
201 void *context); 208 void *context);
202 209
210/*
211 * Updates the superblock immediately.
212 */
213int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
214bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
215
203/*----------------------------------------------------------------*/ 216/*----------------------------------------------------------------*/
204 217
 205#endif 218#endif
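THIN_METADATA_BLOCK_SIZE changes units here: it used to be a byte count and is now expressed in 512-byte sectors via DM_SM_METADATA_BLOCK_SIZE, which is why dm-thin-metadata.c above drops a >> SECTOR_SHIFT when filling the superblock field and adds a << SECTOR_SHIFT when creating the block manager. A quick check of the arithmetic, assuming the 4KiB metadata block size defined in dm-space-map-metadata.h (not shown in this excerpt):

    unsigned md_block_sectors = 8;                      /* DM_SM_METADATA_BLOCK_SIZE, assumed 4096 >> SECTOR_SHIFT */
    unsigned md_block_bytes   = md_block_sectors << 9;  /* 4096 bytes handed to dm_block_manager_create() */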
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index ee29037ffc2e..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
130struct dm_thin_new_mapping; 130struct dm_thin_new_mapping;
131 131
132/* 132/*
133 * The pool runs in 3 modes. Ordered in degraded order for comparisons. 133 * The pool runs in 4 modes. Ordered in degraded order for comparisons.
134 */ 134 */
135enum pool_mode { 135enum pool_mode {
136 PM_WRITE, /* metadata may be changed */ 136 PM_WRITE, /* metadata may be changed */
137 PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
137 PM_READ_ONLY, /* metadata may not be changed */ 138 PM_READ_ONLY, /* metadata may not be changed */
138 PM_FAIL, /* all I/O fails */ 139 PM_FAIL, /* all I/O fails */
139}; 140};
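PM_OUT_OF_DATA_SPACE slots in between PM_WRITE and PM_READ_ONLY because, as the comment says, the enum stays ordered from healthiest to most degraded; being out of data space still permits metadata updates, so it must rank as less degraded than read-only. A hypothetical check of the style that ordering allows (illustration only, not a function from the patch):

    static bool pool_is_degraded(struct pool *pool)
    {
            /* Anything past PM_WRITE is some flavour of degraded. */
            return get_pool_mode(pool) > PM_WRITE;
    }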
@@ -144,6 +145,7 @@ struct pool_features {
144 bool zero_new_blocks:1; 145 bool zero_new_blocks:1;
145 bool discard_enabled:1; 146 bool discard_enabled:1;
146 bool discard_passdown:1; 147 bool discard_passdown:1;
148 bool error_if_no_space:1;
147}; 149};
148 150
149struct thin_c; 151struct thin_c;
@@ -163,8 +165,7 @@ struct pool {
163 int sectors_per_block_shift; 165 int sectors_per_block_shift;
164 166
165 struct pool_features pf; 167 struct pool_features pf;
166 unsigned low_water_triggered:1; /* A dm event has been sent */ 168 bool low_water_triggered:1; /* A dm event has been sent */
167 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
168 169
169 struct dm_bio_prison *prison; 170 struct dm_bio_prison *prison;
170 struct dm_kcopyd_client *copier; 171 struct dm_kcopyd_client *copier;
@@ -198,7 +199,7 @@ struct pool {
198}; 199};
199 200
200static enum pool_mode get_pool_mode(struct pool *pool); 201static enum pool_mode get_pool_mode(struct pool *pool);
201static void set_pool_mode(struct pool *pool, enum pool_mode mode); 202static void metadata_operation_failed(struct pool *pool, const char *op, int r);
202 203
203/* 204/*
204 * Target context for a pool. 205 * Target context for a pool.
@@ -225,6 +226,7 @@ struct thin_c {
225 226
226 struct pool *pool; 227 struct pool *pool;
227 struct dm_thin_device *td; 228 struct dm_thin_device *td;
229 bool requeue_mode:1;
228}; 230};
229 231
230/*----------------------------------------------------------------*/ 232/*----------------------------------------------------------------*/
@@ -368,14 +370,18 @@ struct dm_thin_endio_hook {
368 struct dm_thin_new_mapping *overwrite_mapping; 370 struct dm_thin_new_mapping *overwrite_mapping;
369}; 371};
370 372
371static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 373static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
372{ 374{
373 struct bio *bio; 375 struct bio *bio;
374 struct bio_list bios; 376 struct bio_list bios;
377 unsigned long flags;
375 378
376 bio_list_init(&bios); 379 bio_list_init(&bios);
380
381 spin_lock_irqsave(&tc->pool->lock, flags);
377 bio_list_merge(&bios, master); 382 bio_list_merge(&bios, master);
378 bio_list_init(master); 383 bio_list_init(master);
384 spin_unlock_irqrestore(&tc->pool->lock, flags);
379 385
380 while ((bio = bio_list_pop(&bios))) { 386 while ((bio = bio_list_pop(&bios))) {
381 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 387 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -390,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
390static void requeue_io(struct thin_c *tc) 396static void requeue_io(struct thin_c *tc)
391{ 397{
392 struct pool *pool = tc->pool; 398 struct pool *pool = tc->pool;
399
400 requeue_bio_list(tc, &pool->deferred_bios);
401 requeue_bio_list(tc, &pool->retry_on_resume_list);
402}
403
404static void error_retry_list(struct pool *pool)
405{
406 struct bio *bio;
393 unsigned long flags; 407 unsigned long flags;
408 struct bio_list bios;
409
410 bio_list_init(&bios);
394 411
395 spin_lock_irqsave(&pool->lock, flags); 412 spin_lock_irqsave(&pool->lock, flags);
396 __requeue_bio_list(tc, &pool->deferred_bios); 413 bio_list_merge(&bios, &pool->retry_on_resume_list);
397 __requeue_bio_list(tc, &pool->retry_on_resume_list); 414 bio_list_init(&pool->retry_on_resume_list);
398 spin_unlock_irqrestore(&pool->lock, flags); 415 spin_unlock_irqrestore(&pool->lock, flags);
416
417 while ((bio = bio_list_pop(&bios)))
418 bio_io_error(bio);
399} 419}
400 420
401/* 421/*
@@ -413,7 +433,7 @@ static bool block_size_is_power_of_two(struct pool *pool)
413static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 433static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
414{ 434{
415 struct pool *pool = tc->pool; 435 struct pool *pool = tc->pool;
416 sector_t block_nr = bio->bi_sector; 436 sector_t block_nr = bio->bi_iter.bi_sector;
417 437
418 if (block_size_is_power_of_two(pool)) 438 if (block_size_is_power_of_two(pool))
419 block_nr >>= pool->sectors_per_block_shift; 439 block_nr >>= pool->sectors_per_block_shift;
@@ -426,14 +446,15 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
426static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) 446static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
427{ 447{
428 struct pool *pool = tc->pool; 448 struct pool *pool = tc->pool;
429 sector_t bi_sector = bio->bi_sector; 449 sector_t bi_sector = bio->bi_iter.bi_sector;
430 450
431 bio->bi_bdev = tc->pool_dev->bdev; 451 bio->bi_bdev = tc->pool_dev->bdev;
432 if (block_size_is_power_of_two(pool)) 452 if (block_size_is_power_of_two(pool))
433 bio->bi_sector = (block << pool->sectors_per_block_shift) | 453 bio->bi_iter.bi_sector =
434 (bi_sector & (pool->sectors_per_block - 1)); 454 (block << pool->sectors_per_block_shift) |
455 (bi_sector & (pool->sectors_per_block - 1));
435 else 456 else
436 bio->bi_sector = (block * pool->sectors_per_block) + 457 bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
437 sector_div(bi_sector, pool->sectors_per_block); 458 sector_div(bi_sector, pool->sectors_per_block);
438} 459}
439 460
@@ -509,15 +530,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
509struct dm_thin_new_mapping { 530struct dm_thin_new_mapping {
510 struct list_head list; 531 struct list_head list;
511 532
512 unsigned quiesced:1; 533 bool quiesced:1;
513 unsigned prepared:1; 534 bool prepared:1;
514 unsigned pass_discard:1; 535 bool pass_discard:1;
536 bool definitely_not_shared:1;
515 537
538 int err;
516 struct thin_c *tc; 539 struct thin_c *tc;
517 dm_block_t virt_block; 540 dm_block_t virt_block;
518 dm_block_t data_block; 541 dm_block_t data_block;
519 struct dm_bio_prison_cell *cell, *cell2; 542 struct dm_bio_prison_cell *cell, *cell2;
520 int err;
521 543
522 /* 544 /*
523 * If the bio covers the whole area of a block then we can avoid 545 * If the bio covers the whole area of a block then we can avoid
@@ -534,7 +556,7 @@ static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
534 struct pool *pool = m->tc->pool; 556 struct pool *pool = m->tc->pool;
535 557
536 if (m->quiesced && m->prepared) { 558 if (m->quiesced && m->prepared) {
537 list_add(&m->list, &pool->prepared_mappings); 559 list_add_tail(&m->list, &pool->prepared_mappings);
538 wake_worker(pool); 560 wake_worker(pool);
539 } 561 }
540} 562}
@@ -548,7 +570,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
548 m->err = read_err || write_err ? -EIO : 0; 570 m->err = read_err || write_err ? -EIO : 0;
549 571
550 spin_lock_irqsave(&pool->lock, flags); 572 spin_lock_irqsave(&pool->lock, flags);
551 m->prepared = 1; 573 m->prepared = true;
552 __maybe_add_mapping(m); 574 __maybe_add_mapping(m);
553 spin_unlock_irqrestore(&pool->lock, flags); 575 spin_unlock_irqrestore(&pool->lock, flags);
554} 576}
@@ -563,7 +585,7 @@ static void overwrite_endio(struct bio *bio, int err)
563 m->err = err; 585 m->err = err;
564 586
565 spin_lock_irqsave(&pool->lock, flags); 587 spin_lock_irqsave(&pool->lock, flags);
566 m->prepared = 1; 588 m->prepared = true;
567 __maybe_add_mapping(m); 589 __maybe_add_mapping(m);
568 spin_unlock_irqrestore(&pool->lock, flags); 590 spin_unlock_irqrestore(&pool->lock, flags);
569} 591}
@@ -610,8 +632,10 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c
610 632
611static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) 633static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
612{ 634{
613 if (m->bio) 635 if (m->bio) {
614 m->bio->bi_end_io = m->saved_bi_end_io; 636 m->bio->bi_end_io = m->saved_bi_end_io;
637 atomic_inc(&m->bio->bi_remaining);
638 }
615 cell_error(m->tc->pool, m->cell); 639 cell_error(m->tc->pool, m->cell);
616 list_del(&m->list); 640 list_del(&m->list);
617 mempool_free(m, m->tc->pool->mapping_pool); 641 mempool_free(m, m->tc->pool->mapping_pool);
@@ -625,8 +649,10 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
625 int r; 649 int r;
626 650
627 bio = m->bio; 651 bio = m->bio;
628 if (bio) 652 if (bio) {
629 bio->bi_end_io = m->saved_bi_end_io; 653 bio->bi_end_io = m->saved_bi_end_io;
654 atomic_inc(&bio->bi_remaining);
655 }
630 656
631 if (m->err) { 657 if (m->err) {
632 cell_error(pool, m->cell); 658 cell_error(pool, m->cell);
@@ -640,9 +666,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
640 */ 666 */
641 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 667 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
642 if (r) { 668 if (r) {
643 DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d", 669 metadata_operation_failed(pool, "dm_thin_insert_block", r);
644 dm_device_name(pool->pool_md), r);
645 set_pool_mode(pool, PM_READ_ONLY);
646 cell_error(pool, m->cell); 670 cell_error(pool, m->cell);
647 goto out; 671 goto out;
648 } 672 }
@@ -683,7 +707,15 @@ static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
683 cell_defer_no_holder(tc, m->cell2); 707 cell_defer_no_holder(tc, m->cell2);
684 708
685 if (m->pass_discard) 709 if (m->pass_discard)
686 remap_and_issue(tc, m->bio, m->data_block); 710 if (m->definitely_not_shared)
711 remap_and_issue(tc, m->bio, m->data_block);
712 else {
713 bool used = false;
714 if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
715 bio_endio(m->bio, 0);
716 else
717 remap_and_issue(tc, m->bio, m->data_block);
718 }
687 else 719 else
688 bio_endio(m->bio, 0); 720 bio_endio(m->bio, 0);
689 721
@@ -723,7 +755,8 @@ static void process_prepared(struct pool *pool, struct list_head *head,
723 */ 755 */
724static int io_overlaps_block(struct pool *pool, struct bio *bio) 756static int io_overlaps_block(struct pool *pool, struct bio *bio)
725{ 757{
726 return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT); 758 return bio->bi_iter.bi_size ==
759 (pool->sectors_per_block << SECTOR_SHIFT);
727} 760}
728 761
729static int io_overwrites_block(struct pool *pool, struct bio *bio) 762static int io_overwrites_block(struct pool *pool, struct bio *bio)
@@ -751,13 +784,17 @@ static int ensure_next_mapping(struct pool *pool)
751 784
752static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) 785static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
753{ 786{
754 struct dm_thin_new_mapping *r = pool->next_mapping; 787 struct dm_thin_new_mapping *m = pool->next_mapping;
755 788
756 BUG_ON(!pool->next_mapping); 789 BUG_ON(!pool->next_mapping);
757 790
791 memset(m, 0, sizeof(struct dm_thin_new_mapping));
792 INIT_LIST_HEAD(&m->list);
793 m->bio = NULL;
794
758 pool->next_mapping = NULL; 795 pool->next_mapping = NULL;
759 796
760 return r; 797 return m;
761} 798}
762 799
763static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 800static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -769,18 +806,13 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
769 struct pool *pool = tc->pool; 806 struct pool *pool = tc->pool;
770 struct dm_thin_new_mapping *m = get_next_mapping(pool); 807 struct dm_thin_new_mapping *m = get_next_mapping(pool);
771 808
772 INIT_LIST_HEAD(&m->list);
773 m->quiesced = 0;
774 m->prepared = 0;
775 m->tc = tc; 809 m->tc = tc;
776 m->virt_block = virt_block; 810 m->virt_block = virt_block;
777 m->data_block = data_dest; 811 m->data_block = data_dest;
778 m->cell = cell; 812 m->cell = cell;
779 m->err = 0;
780 m->bio = NULL;
781 813
782 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) 814 if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
783 m->quiesced = 1; 815 m->quiesced = true;
784 816
785 /* 817 /*
786 * IO to pool_dev remaps to the pool target's data_dev. 818 * IO to pool_dev remaps to the pool target's data_dev.
@@ -840,15 +872,12 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
840 struct pool *pool = tc->pool; 872 struct pool *pool = tc->pool;
841 struct dm_thin_new_mapping *m = get_next_mapping(pool); 873 struct dm_thin_new_mapping *m = get_next_mapping(pool);
842 874
843 INIT_LIST_HEAD(&m->list); 875 m->quiesced = true;
844 m->quiesced = 1; 876 m->prepared = false;
845 m->prepared = 0;
846 m->tc = tc; 877 m->tc = tc;
847 m->virt_block = virt_block; 878 m->virt_block = virt_block;
848 m->data_block = data_block; 879 m->data_block = data_block;
849 m->cell = cell; 880 m->cell = cell;
850 m->err = 0;
851 m->bio = NULL;
852 881
853 /* 882 /*
854 * If the whole block of data is being overwritten or we are not 883 * If the whole block of data is being overwritten or we are not
@@ -895,41 +924,44 @@ static int commit(struct pool *pool)
 895 return -EINVAL; 924 return -EINVAL;
896 925
897 r = dm_pool_commit_metadata(pool->pmd); 926 r = dm_pool_commit_metadata(pool->pmd);
898 if (r) { 927 if (r)
899 DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d", 928 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
900 dm_device_name(pool->pool_md), r);
901 set_pool_mode(pool, PM_READ_ONLY);
902 }
903 929
904 return r; 930 return r;
905} 931}
906 932
907static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 933static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
908{ 934{
909 int r;
910 dm_block_t free_blocks;
911 unsigned long flags; 935 unsigned long flags;
912 struct pool *pool = tc->pool;
913
914 /*
915 * Once no_free_space is set we must not allow allocation to succeed.
916 * Otherwise it is difficult to explain, debug, test and support.
917 */
918 if (pool->no_free_space)
919 return -ENOSPC;
920
921 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
922 if (r)
923 return r;
924 936
925 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 937 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
926 DMWARN("%s: reached low water mark for data device: sending event.", 938 DMWARN("%s: reached low water mark for data device: sending event.",
927 dm_device_name(pool->pool_md)); 939 dm_device_name(pool->pool_md));
928 spin_lock_irqsave(&pool->lock, flags); 940 spin_lock_irqsave(&pool->lock, flags);
929 pool->low_water_triggered = 1; 941 pool->low_water_triggered = true;
930 spin_unlock_irqrestore(&pool->lock, flags); 942 spin_unlock_irqrestore(&pool->lock, flags);
931 dm_table_event(pool->ti->table); 943 dm_table_event(pool->ti->table);
932 } 944 }
945}
946
947static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
948
949static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
950{
951 int r;
952 dm_block_t free_blocks;
953 struct pool *pool = tc->pool;
954
955 if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
956 return -EINVAL;
957
958 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
959 if (r) {
960 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
961 return r;
962 }
963
964 check_low_water_mark(pool, free_blocks);
933 965
934 if (!free_blocks) { 966 if (!free_blocks) {
935 /* 967 /*
@@ -941,35 +973,20 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
941 return r; 973 return r;
942 974
943 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 975 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
944 if (r) 976 if (r) {
977 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
945 return r; 978 return r;
979 }
946 980
947 /*
948 * If we still have no space we set a flag to avoid
949 * doing all this checking and return -ENOSPC. This
950 * flag serves as a latch that disallows allocations from
951 * this pool until the admin takes action (e.g. resize or
952 * table reload).
953 */
954 if (!free_blocks) { 981 if (!free_blocks) {
955 DMWARN("%s: no free data space available.", 982 set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
956 dm_device_name(pool->pool_md));
957 spin_lock_irqsave(&pool->lock, flags);
958 pool->no_free_space = 1;
959 spin_unlock_irqrestore(&pool->lock, flags);
960 return -ENOSPC; 983 return -ENOSPC;
961 } 984 }
962 } 985 }
963 986
964 r = dm_pool_alloc_data_block(pool->pmd, result); 987 r = dm_pool_alloc_data_block(pool->pmd, result);
965 if (r) { 988 if (r) {
966 if (r == -ENOSPC && 989 metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
967 !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
968 !free_blocks) {
969 DMWARN("%s: no free metadata space available.",
970 dm_device_name(pool->pool_md));
971 set_pool_mode(pool, PM_READ_ONLY);
972 }
973 return r; 990 return r;
974 } 991 }
975 992
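alloc_data_block() loses the sticky no_free_space latch; running out of space is now a pool mode (PM_OUT_OF_DATA_SPACE) that a later resize or resume can clear. A condensed restatement of the new flow with the error paths annotated; this is a sketch of the hunks above, not a drop-in replacement:

    static int alloc_data_block_sketch(struct thin_c *tc, dm_block_t *result)
    {
            int r;
            dm_block_t free_blocks;
            struct pool *pool = tc->pool;

            if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
                    return -EINVAL;                 /* never allocate from a degraded pool */

            r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
            if (r) {
                    metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
                    return r;                       /* transaction aborted, pool now read-only */
            }

            check_low_water_mark(pool, free_blocks); /* one-shot dm event near exhaustion */

            if (!free_blocks) {
                    /* A commit may release blocks held by the previous transaction. */
                    r = commit(pool);
                    if (r)
                            return r;

                    r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
                    if (r) {
                            metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
                            return r;
                    }

                    if (!free_blocks) {
                            set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
                            return -ENOSPC;         /* callers queue or error bios per error_if_no_space */
                    }
            }

            r = dm_pool_alloc_data_block(pool->pmd, result);
            if (r)
                    metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);

            return r;
    }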
@@ -992,16 +1009,56 @@ static void retry_on_resume(struct bio *bio)
992 spin_unlock_irqrestore(&pool->lock, flags); 1009 spin_unlock_irqrestore(&pool->lock, flags);
993} 1010}
994 1011
995static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell) 1012static bool should_error_unserviceable_bio(struct pool *pool)
1013{
1014 enum pool_mode m = get_pool_mode(pool);
1015
1016 switch (m) {
1017 case PM_WRITE:
1018 /* Shouldn't get here */
1019 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1020 return true;
1021
1022 case PM_OUT_OF_DATA_SPACE:
1023 return pool->pf.error_if_no_space;
1024
1025 case PM_READ_ONLY:
1026 case PM_FAIL:
1027 return true;
1028 default:
1029 /* Shouldn't get here */
1030 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1031 return true;
1032 }
1033}
1034
1035static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1036{
1037 if (should_error_unserviceable_bio(pool))
1038 bio_io_error(bio);
1039 else
1040 retry_on_resume(bio);
1041}
1042
1043static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
996{ 1044{
997 struct bio *bio; 1045 struct bio *bio;
998 struct bio_list bios; 1046 struct bio_list bios;
999 1047
1048 if (should_error_unserviceable_bio(pool)) {
1049 cell_error(pool, cell);
1050 return;
1051 }
1052
1000 bio_list_init(&bios); 1053 bio_list_init(&bios);
1001 cell_release(pool, cell, &bios); 1054 cell_release(pool, cell, &bios);
1002 1055
1003 while ((bio = bio_list_pop(&bios))) 1056 if (should_error_unserviceable_bio(pool))
1004 retry_on_resume(bio); 1057 while ((bio = bio_list_pop(&bios)))
1058 bio_io_error(bio);
1059 else
1060 while ((bio = bio_list_pop(&bios)))
1061 retry_on_resume(bio);
1005} 1062}
1006 1063
1007static void process_discard(struct thin_c *tc, struct bio *bio) 1064static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1040,17 +1097,17 @@ static void process_discard(struct thin_c *tc, struct bio *bio)
1040 */ 1097 */
1041 m = get_next_mapping(pool); 1098 m = get_next_mapping(pool);
1042 m->tc = tc; 1099 m->tc = tc;
1043 m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; 1100 m->pass_discard = pool->pf.discard_passdown;
1101 m->definitely_not_shared = !lookup_result.shared;
1044 m->virt_block = block; 1102 m->virt_block = block;
1045 m->data_block = lookup_result.block; 1103 m->data_block = lookup_result.block;
1046 m->cell = cell; 1104 m->cell = cell;
1047 m->cell2 = cell2; 1105 m->cell2 = cell2;
1048 m->err = 0;
1049 m->bio = bio; 1106 m->bio = bio;
1050 1107
1051 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { 1108 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
1052 spin_lock_irqsave(&pool->lock, flags); 1109 spin_lock_irqsave(&pool->lock, flags);
1053 list_add(&m->list, &pool->prepared_discards); 1110 list_add_tail(&m->list, &pool->prepared_discards);
1054 spin_unlock_irqrestore(&pool->lock, flags); 1111 spin_unlock_irqrestore(&pool->lock, flags);
1055 wake_worker(pool); 1112 wake_worker(pool);
1056 } 1113 }
@@ -1105,13 +1162,12 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1105 break; 1162 break;
1106 1163
1107 case -ENOSPC: 1164 case -ENOSPC:
1108 no_space(pool, cell); 1165 retry_bios_on_resume(pool, cell);
1109 break; 1166 break;
1110 1167
1111 default: 1168 default:
1112 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1169 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1113 __func__, r); 1170 __func__, r);
1114 set_pool_mode(pool, PM_READ_ONLY);
1115 cell_error(pool, cell); 1171 cell_error(pool, cell);
1116 break; 1172 break;
1117 } 1173 }
@@ -1133,7 +1189,7 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1133 if (bio_detain(pool, &key, bio, &cell)) 1189 if (bio_detain(pool, &key, bio, &cell))
1134 return; 1190 return;
1135 1191
1136 if (bio_data_dir(bio) == WRITE && bio->bi_size) 1192 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
1137 break_sharing(tc, bio, block, &key, lookup_result, cell); 1193 break_sharing(tc, bio, block, &key, lookup_result, cell);
1138 else { 1194 else {
1139 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1195 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -1156,7 +1212,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1156 /* 1212 /*
1157 * Remap empty bios (flushes) immediately, without provisioning. 1213 * Remap empty bios (flushes) immediately, without provisioning.
1158 */ 1214 */
1159 if (!bio->bi_size) { 1215 if (!bio->bi_iter.bi_size) {
1160 inc_all_io_entry(pool, bio); 1216 inc_all_io_entry(pool, bio);
1161 cell_defer_no_holder(tc, cell); 1217 cell_defer_no_holder(tc, cell);
1162 1218
@@ -1184,13 +1240,12 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1184 break; 1240 break;
1185 1241
1186 case -ENOSPC: 1242 case -ENOSPC:
1187 no_space(pool, cell); 1243 retry_bios_on_resume(pool, cell);
1188 break; 1244 break;
1189 1245
1190 default: 1246 default:
1191 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1247 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1192 __func__, r); 1248 __func__, r);
1193 set_pool_mode(pool, PM_READ_ONLY);
1194 cell_error(pool, cell); 1249 cell_error(pool, cell);
1195 break; 1250 break;
1196 } 1251 }
@@ -1256,8 +1311,8 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1256 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1311 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1257 switch (r) { 1312 switch (r) {
1258 case 0: 1313 case 0:
1259 if (lookup_result.shared && (rw == WRITE) && bio->bi_size) 1314 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
1260 bio_io_error(bio); 1315 handle_unserviceable_bio(tc->pool, bio);
1261 else { 1316 else {
1262 inc_all_io_entry(tc->pool, bio); 1317 inc_all_io_entry(tc->pool, bio);
1263 remap_and_issue(tc, bio, lookup_result.block); 1318 remap_and_issue(tc, bio, lookup_result.block);
@@ -1266,7 +1321,7 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1266 1321
1267 case -ENODATA: 1322 case -ENODATA:
1268 if (rw != READ) { 1323 if (rw != READ) {
1269 bio_io_error(bio); 1324 handle_unserviceable_bio(tc->pool, bio);
1270 break; 1325 break;
1271 } 1326 }
1272 1327
@@ -1288,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1288 } 1343 }
1289} 1344}
1290 1345
1346static void process_bio_success(struct thin_c *tc, struct bio *bio)
1347{
1348 bio_endio(bio, 0);
1349}
1350
1291static void process_bio_fail(struct thin_c *tc, struct bio *bio) 1351static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1292{ 1352{
1293 bio_io_error(bio); 1353 bio_io_error(bio);
@@ -1320,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool)
1320 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1380 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1321 struct thin_c *tc = h->tc; 1381 struct thin_c *tc = h->tc;
1322 1382
1383 if (tc->requeue_mode) {
1384 bio_endio(bio, DM_ENDIO_REQUEUE);
1385 continue;
1386 }
1387
1323 /* 1388 /*
1324 * If we've got no free new_mapping structs, and processing 1389 * If we've got no free new_mapping structs, and processing
1325 * this bio might require one, we pause until there are some 1390 * this bio might require one, we pause until there are some
@@ -1349,7 +1414,8 @@ static void process_deferred_bios(struct pool *pool)
1349 bio_list_init(&pool->deferred_flush_bios); 1414 bio_list_init(&pool->deferred_flush_bios);
1350 spin_unlock_irqrestore(&pool->lock, flags); 1415 spin_unlock_irqrestore(&pool->lock, flags);
1351 1416
1352 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1417 if (bio_list_empty(&bios) &&
1418 !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
1353 return; 1419 return;
1354 1420
1354 1420
1355 if (commit(pool)) { 1421 if (commit(pool)) {
@@ -1385,46 +1451,134 @@ static void do_waker(struct work_struct *ws)
1385 1451
1386/*----------------------------------------------------------------*/ 1452/*----------------------------------------------------------------*/
1387 1453
1454struct noflush_work {
1455 struct work_struct worker;
1456 struct thin_c *tc;
1457
1458 atomic_t complete;
1459 wait_queue_head_t wait;
1460};
1461
1462static void complete_noflush_work(struct noflush_work *w)
1463{
1464 atomic_set(&w->complete, 1);
1465 wake_up(&w->wait);
1466}
1467
1468static void do_noflush_start(struct work_struct *ws)
1469{
1470 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1471 w->tc->requeue_mode = true;
1472 requeue_io(w->tc);
1473 complete_noflush_work(w);
1474}
1475
1476static void do_noflush_stop(struct work_struct *ws)
1477{
1478 struct noflush_work *w = container_of(ws, struct noflush_work, worker);
1479 w->tc->requeue_mode = false;
1480 complete_noflush_work(w);
1481}
1482
1483static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
1484{
1485 struct noflush_work w;
1486
1487 INIT_WORK(&w.worker, fn);
1488 w.tc = tc;
1489 atomic_set(&w.complete, 0);
1490 init_waitqueue_head(&w.wait);
1491
1492 queue_work(tc->pool->wq, &w.worker);
1493
1494 wait_event(w.wait, atomic_read(&w.complete));
1495}
1496
1497/*----------------------------------------------------------------*/
1498
1388static enum pool_mode get_pool_mode(struct pool *pool) 1499static enum pool_mode get_pool_mode(struct pool *pool)
1389{ 1500{
1390 return pool->pf.mode; 1501 return pool->pf.mode;
1391} 1502}
1392 1503
1393static void set_pool_mode(struct pool *pool, enum pool_mode mode) 1504static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
1394{ 1505{
1395 int r; 1506 dm_table_event(pool->ti->table);
1507 DMINFO("%s: switching pool to %s mode",
1508 dm_device_name(pool->pool_md), new_mode);
1509}
1396 1510
1397 pool->pf.mode = mode; 1511static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
1512{
1513 struct pool_c *pt = pool->ti->private;
1514 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
1515 enum pool_mode old_mode = get_pool_mode(pool);
1398 1516
1399 switch (mode) { 1517 /*
1400 case PM_FAIL: 1518 * Never allow the pool to transition to PM_WRITE mode if user
1401 DMERR("%s: switching pool to failure mode", 1519 * intervention is required to verify metadata and data consistency.
1520 */
1521 if (new_mode == PM_WRITE && needs_check) {
1522 DMERR("%s: unable to switch pool to write mode until repaired.",
1402 dm_device_name(pool->pool_md)); 1523 dm_device_name(pool->pool_md));
1524 if (old_mode != new_mode)
1525 new_mode = old_mode;
1526 else
1527 new_mode = PM_READ_ONLY;
1528 }
1529 /*
1530 * If we were in PM_FAIL mode, rollback of metadata failed. We're
1531 * not going to recover without a thin_repair. So we never let the
1532 * pool move out of the old mode.
1533 */
1534 if (old_mode == PM_FAIL)
1535 new_mode = old_mode;
1536
1537 switch (new_mode) {
1538 case PM_FAIL:
1539 if (old_mode != new_mode)
1540 notify_of_pool_mode_change(pool, "failure");
1403 dm_pool_metadata_read_only(pool->pmd); 1541 dm_pool_metadata_read_only(pool->pmd);
1404 pool->process_bio = process_bio_fail; 1542 pool->process_bio = process_bio_fail;
1405 pool->process_discard = process_bio_fail; 1543 pool->process_discard = process_bio_fail;
1406 pool->process_prepared_mapping = process_prepared_mapping_fail; 1544 pool->process_prepared_mapping = process_prepared_mapping_fail;
1407 pool->process_prepared_discard = process_prepared_discard_fail; 1545 pool->process_prepared_discard = process_prepared_discard_fail;
1546
1547 error_retry_list(pool);
1408 break; 1548 break;
1409 1549
1410 case PM_READ_ONLY: 1550 case PM_READ_ONLY:
1411 DMERR("%s: switching pool to read-only mode", 1551 if (old_mode != new_mode)
1412 dm_device_name(pool->pool_md)); 1552 notify_of_pool_mode_change(pool, "read-only");
1413 r = dm_pool_abort_metadata(pool->pmd); 1553 dm_pool_metadata_read_only(pool->pmd);
1414 if (r) { 1554 pool->process_bio = process_bio_read_only;
1415 DMERR("%s: aborting transaction failed", 1555 pool->process_discard = process_bio_success;
1416 dm_device_name(pool->pool_md)); 1556 pool->process_prepared_mapping = process_prepared_mapping_fail;
1417 set_pool_mode(pool, PM_FAIL); 1557 pool->process_prepared_discard = process_prepared_discard_passdown;
1418 } else { 1558
1419 dm_pool_metadata_read_only(pool->pmd); 1559 error_retry_list(pool);
1420 pool->process_bio = process_bio_read_only; 1560 break;
1421 pool->process_discard = process_discard; 1561
1422 pool->process_prepared_mapping = process_prepared_mapping_fail; 1562 case PM_OUT_OF_DATA_SPACE:
1423 pool->process_prepared_discard = process_prepared_discard_passdown; 1563 /*
1424 } 1564 * Ideally we'd never hit this state; the low water mark
1565 * would trigger userland to extend the pool before we
1566 * completely run out of data space. However, many small
1567 * IOs to unprovisioned space can consume data space at an
1568 * alarming rate. Adjust your low water mark if you're
1569 * frequently seeing this mode.
1570 */
1571 if (old_mode != new_mode)
1572 notify_of_pool_mode_change(pool, "out-of-data-space");
1573 pool->process_bio = process_bio_read_only;
1574 pool->process_discard = process_discard;
1575 pool->process_prepared_mapping = process_prepared_mapping;
1576 pool->process_prepared_discard = process_prepared_discard_passdown;
1425 break; 1577 break;
1426 1578
1427 case PM_WRITE: 1579 case PM_WRITE:
1580 if (old_mode != new_mode)
1581 notify_of_pool_mode_change(pool, "write");
1428 dm_pool_metadata_read_write(pool->pmd); 1582 dm_pool_metadata_read_write(pool->pmd);
1429 pool->process_bio = process_bio; 1583 pool->process_bio = process_bio;
1430 pool->process_discard = process_discard; 1584 pool->process_discard = process_discard;
@@ -1432,6 +1586,38 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1432 pool->process_prepared_discard = process_prepared_discard; 1586 pool->process_prepared_discard = process_prepared_discard;
1433 break; 1587 break;
1434 } 1588 }
1589
1590 pool->pf.mode = new_mode;
1591 /*
1592 * The pool mode may have changed, sync it so bind_control_target()
1593 * doesn't cause an unexpected mode transition on resume.
1594 */
1595 pt->adjusted_pf.mode = new_mode;
1596}
1597
1598static void abort_transaction(struct pool *pool)
1599{
1600 const char *dev_name = dm_device_name(pool->pool_md);
1601
1602 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1603 if (dm_pool_abort_metadata(pool->pmd)) {
1604 DMERR("%s: failed to abort metadata transaction", dev_name);
1605 set_pool_mode(pool, PM_FAIL);
1606 }
1607
1608 if (dm_pool_metadata_set_needs_check(pool->pmd)) {
1609 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1610 set_pool_mode(pool, PM_FAIL);
1611 }
1612}
1613
1614static void metadata_operation_failed(struct pool *pool, const char *op, int r)
1615{
1616 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1617 dm_device_name(pool->pool_md), op, r);
1618
1619 abort_transaction(pool);
1620 set_pool_mode(pool, PM_READ_ONLY);
1435} 1621}
1436 1622
1437/*----------------------------------------------------------------*/ 1623/*----------------------------------------------------------------*/
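The refactor above gives the pool a single failure funnel: metadata_operation_failed() logs the failing operation, abort_transaction() rolls back and sets the needs_check flag, and the pool drops to read-only; set_pool_mode() itself now only notifies userspace when the mode actually changes. A minimal standalone C sketch of that notify-on-transition idea follows; the names mirror the driver but the bodies are illustrative stand-ins, not kernel code.

#include <stdio.h>

enum pool_mode { PM_WRITE, PM_OUT_OF_DATA_SPACE, PM_READ_ONLY, PM_FAIL };

struct pool {
    enum pool_mode mode;
    const char *name;
};

/* Stand-in for the uevent/table-event delivery done in the driver. */
static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
{
    printf("%s: switching pool to %s mode\n", pool->name, new_mode);
}

static const char *mode_string(enum pool_mode m)
{
    switch (m) {
    case PM_WRITE:             return "write";
    case PM_OUT_OF_DATA_SPACE: return "out-of-data-space";
    case PM_READ_ONLY:         return "read-only";
    default:                   return "failure";
    }
}

static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
    enum pool_mode old_mode = pool->mode;

    /* Only tell userspace when the mode actually transitions. */
    if (old_mode != new_mode)
        notify_of_pool_mode_change(pool, mode_string(new_mode));

    pool->mode = new_mode;
}

int main(void)
{
    struct pool pool = { PM_WRITE, "pool0" };

    set_pool_mode(&pool, PM_WRITE);     /* no change: no notification */
    set_pool_mode(&pool, PM_READ_ONLY); /* notifies once */
    set_pool_mode(&pool, PM_READ_ONLY); /* no change again */
    return 0;
}

Running it prints a single "switching pool to read-only mode" line for the two identical calls, which is exactly the behaviour the old_mode != new_mode guard buys.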
@@ -1481,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1481 1667
1482 thin_hook_bio(tc, bio); 1668 thin_hook_bio(tc, bio);
1483 1669
1670 if (tc->requeue_mode) {
1671 bio_endio(bio, DM_ENDIO_REQUEUE);
1672 return DM_MAPIO_SUBMITTED;
1673 }
1674
1484 if (get_pool_mode(tc->pool) == PM_FAIL) { 1675 if (get_pool_mode(tc->pool) == PM_FAIL) {
1485 bio_io_error(bio); 1676 bio_io_error(bio);
1486 return DM_MAPIO_SUBMITTED; 1677 return DM_MAPIO_SUBMITTED;
@@ -1538,9 +1729,9 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
1538 if (get_pool_mode(tc->pool) == PM_READ_ONLY) { 1729 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1539 /* 1730 /*
1540 * This block isn't provisioned, and we have no way 1731 * This block isn't provisioned, and we have no way
1541 * of doing so. Just error it. 1732 * of doing so.
1542 */ 1733 */
1543 bio_io_error(bio); 1734 handle_unserviceable_bio(tc->pool, bio);
1544 return DM_MAPIO_SUBMITTED; 1735 return DM_MAPIO_SUBMITTED;
1545 } 1736 }
1546 /* fall through */ 1737 /* fall through */
@@ -1644,22 +1835,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1644 /* 1835 /*
1645 * We want to make sure that a pool in PM_FAIL mode is never upgraded. 1836 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
1646 */ 1837 */
1647 enum pool_mode old_mode = pool->pf.mode; 1838 enum pool_mode old_mode = get_pool_mode(pool);
1648 enum pool_mode new_mode = pt->adjusted_pf.mode; 1839 enum pool_mode new_mode = pt->adjusted_pf.mode;
1649 1840
1650 /* 1841 /*
1651 * If we were in PM_FAIL mode, rollback of metadata failed. We're 1842 * Don't change the pool's mode until set_pool_mode() below.
1652 * not going to recover without a thin_repair. So we never let the 1843 * Otherwise the pool's process_* function pointers may
1653 * pool move out of the old mode. On the other hand a PM_READ_ONLY 1844 * not match the desired pool mode.
1654 * may have been due to a lack of metadata or data space, and may
1655 * now work (ie. if the underlying devices have been resized).
1656 */ 1845 */
1657 if (old_mode == PM_FAIL) 1846 pt->adjusted_pf.mode = old_mode;
1658 new_mode = old_mode;
1659 1847
1660 pool->ti = ti; 1848 pool->ti = ti;
1661 pool->low_water_blocks = pt->low_water_blocks;
1662 pool->pf = pt->adjusted_pf; 1849 pool->pf = pt->adjusted_pf;
1850 pool->low_water_blocks = pt->low_water_blocks;
1663 1851
1664 set_pool_mode(pool, new_mode); 1852 set_pool_mode(pool, new_mode);
1665 1853
@@ -1682,6 +1870,7 @@ static void pool_features_init(struct pool_features *pf)
1682 pf->zero_new_blocks = true; 1870 pf->zero_new_blocks = true;
1683 pf->discard_enabled = true; 1871 pf->discard_enabled = true;
1684 pf->discard_passdown = true; 1872 pf->discard_passdown = true;
1873 pf->error_if_no_space = false;
1685} 1874}
1686 1875
1687static void __pool_destroy(struct pool *pool) 1876static void __pool_destroy(struct pool *pool)
@@ -1772,8 +1961,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1772 bio_list_init(&pool->deferred_flush_bios); 1961 bio_list_init(&pool->deferred_flush_bios);
1773 INIT_LIST_HEAD(&pool->prepared_mappings); 1962 INIT_LIST_HEAD(&pool->prepared_mappings);
1774 INIT_LIST_HEAD(&pool->prepared_discards); 1963 INIT_LIST_HEAD(&pool->prepared_discards);
1775 pool->low_water_triggered = 0; 1964 pool->low_water_triggered = false;
1776 pool->no_free_space = 0;
1777 bio_list_init(&pool->retry_on_resume_list); 1965 bio_list_init(&pool->retry_on_resume_list);
1778 1966
1779 pool->shared_read_ds = dm_deferred_set_create(); 1967 pool->shared_read_ds = dm_deferred_set_create();
@@ -1898,7 +2086,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1898 const char *arg_name; 2086 const char *arg_name;
1899 2087
1900 static struct dm_arg _args[] = { 2088 static struct dm_arg _args[] = {
1901 {0, 3, "Invalid number of pool feature arguments"}, 2089 {0, 4, "Invalid number of pool feature arguments"},
1902 }; 2090 };
1903 2091
1904 /* 2092 /*
@@ -1927,6 +2115,9 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1927 else if (!strcasecmp(arg_name, "read_only")) 2115 else if (!strcasecmp(arg_name, "read_only"))
1928 pf->mode = PM_READ_ONLY; 2116 pf->mode = PM_READ_ONLY;
1929 2117
2118 else if (!strcasecmp(arg_name, "error_if_no_space"))
2119 pf->error_if_no_space = true;
2120
1930 else { 2121 else {
1931 ti->error = "Unrecognised pool feature requested"; 2122 ti->error = "Unrecognised pool feature requested";
1932 r = -EINVAL; 2123 r = -EINVAL;
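With error_if_no_space the pool table now accepts up to four optional feature words. A hedged userspace sketch of the same keyword matching is below; pool_features is flattened here (the mode collapsed to a read_only flag) and the argument handling is plain argc/argv rather than dm_arg_set.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

struct pool_features {
    bool zero_new_blocks;
    bool discard_enabled;
    bool discard_passdown;
    bool read_only;
    bool error_if_no_space;
};

/* Returns 0 on success, -1 on an unrecognised feature word. */
static int parse_pool_features(int argc, char **argv, struct pool_features *pf)
{
    /* Defaults match the "no feature args" case. */
    pf->zero_new_blocks = true;
    pf->discard_enabled = true;
    pf->discard_passdown = true;
    pf->read_only = false;
    pf->error_if_no_space = false;

    for (int i = 0; i < argc; i++) {
        if (!strcasecmp(argv[i], "skip_block_zeroing"))
            pf->zero_new_blocks = false;
        else if (!strcasecmp(argv[i], "ignore_discard"))
            pf->discard_enabled = false;
        else if (!strcasecmp(argv[i], "no_discard_passdown"))
            pf->discard_passdown = false;
        else if (!strcasecmp(argv[i], "read_only"))
            pf->read_only = true;
        else if (!strcasecmp(argv[i], "error_if_no_space"))
            pf->error_if_no_space = true;
        else {
            fprintf(stderr, "Unrecognised pool feature: %s\n", argv[i]);
            return -1;
        }
    }
    return 0;
}

int main(int argc, char **argv)
{
    struct pool_features pf;

    if (parse_pool_features(argc - 1, argv + 1, &pf))
        return 1;
    printf("error_if_no_space=%d read_only=%d\n",
           pf.error_if_no_space, pf.read_only);
    return 0;
}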
@@ -1947,16 +2138,27 @@ static void metadata_low_callback(void *context)
1947 dm_table_event(pool->ti->table); 2138 dm_table_event(pool->ti->table);
1948} 2139}
1949 2140
1950static sector_t get_metadata_dev_size(struct block_device *bdev) 2141static sector_t get_dev_size(struct block_device *bdev)
1951{ 2142{
1952 sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; 2143 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2144}
2145
2146static void warn_if_metadata_device_too_big(struct block_device *bdev)
2147{
2148 sector_t metadata_dev_size = get_dev_size(bdev);
1953 char buffer[BDEVNAME_SIZE]; 2149 char buffer[BDEVNAME_SIZE];
1954 2150
1955 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) { 2151 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1956 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2152 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1957 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); 2153 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
1958 metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING; 2154}
1959 } 2155
2156static sector_t get_metadata_dev_size(struct block_device *bdev)
2157{
2158 sector_t metadata_dev_size = get_dev_size(bdev);
2159
2160 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2161 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
1960 2162
1961 return metadata_dev_size; 2163 return metadata_dev_size;
1962} 2164}
@@ -1965,7 +2167,7 @@ static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
1965{ 2167{
1966 sector_t metadata_dev_size = get_metadata_dev_size(bdev); 2168 sector_t metadata_dev_size = get_metadata_dev_size(bdev);
1967 2169
1968 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 2170 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
1969 2171
1970 return metadata_dev_size; 2172 return metadata_dev_size;
1971} 2173}
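get_metadata_dev_size() now clamps silently to THIN_METADATA_MAX_SECTORS (the warning lives in warn_if_metadata_device_too_big()), and the divisor losing its ">> SECTOR_SHIFT" suggests THIN_METADATA_BLOCK_SIZE is expressed in sectors in this series. A small sketch of the clamp-then-convert path, with both constants treated as illustrative values rather than the driver's real ones:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Illustrative values: a 4 KiB metadata block is 8 x 512-byte sectors. */
#define THIN_METADATA_BLOCK_SIZE   8ULL  /* sectors */
#define THIN_METADATA_MAX_SECTORS  (255ULL * (1 << 14) * THIN_METADATA_BLOCK_SIZE)

static sector_t clamp_metadata_dev_size(sector_t dev_sectors)
{
    if (dev_sectors > THIN_METADATA_MAX_SECTORS)
        dev_sectors = THIN_METADATA_MAX_SECTORS;
    return dev_sectors;
}

static uint64_t metadata_dev_size_in_blocks(sector_t dev_sectors)
{
    /* sector_div() in the kernel; plain integer division here. */
    return clamp_metadata_dev_size(dev_sectors) / THIN_METADATA_BLOCK_SIZE;
}

int main(void)
{
    sector_t huge = 1ULL << 40; /* a deliberately oversized device */

    printf("usable metadata blocks: %llu\n",
           (unsigned long long)metadata_dev_size_in_blocks(huge));
    return 0;
}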
@@ -1997,6 +2199,8 @@ static dm_block_t calc_metadata_threshold(struct pool_c *pt)
1997 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 2199 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1998 * ignore_discard: disable discard 2200 * ignore_discard: disable discard
1999 * no_discard_passdown: don't pass discards down to the data device 2201 * no_discard_passdown: don't pass discards down to the data device
2202 * read_only: Don't allow any changes to be made to the pool metadata.
2203 * error_if_no_space: error IOs, instead of queueing, if no space.
2000 */ 2204 */
2001static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 2205static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2002{ 2206{
@@ -2041,12 +2245,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2041 ti->error = "Error opening metadata block device"; 2245 ti->error = "Error opening metadata block device";
2042 goto out_unlock; 2246 goto out_unlock;
2043 } 2247 }
2044 2248 warn_if_metadata_device_too_big(metadata_dev->bdev);
2045 /*
2046 * Run for the side-effect of possibly issuing a warning if the
2047 * device is too big.
2048 */
2049 (void) get_metadata_dev_size(metadata_dev->bdev);
2050 2249
2051 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 2250 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2052 if (r) { 2251 if (r) {
@@ -2192,11 +2391,19 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
2192 return -EINVAL; 2391 return -EINVAL;
2193 2392
2194 } else if (data_size > sb_data_size) { 2393 } else if (data_size > sb_data_size) {
2394 if (dm_pool_metadata_needs_check(pool->pmd)) {
2395 DMERR("%s: unable to grow the data device until repaired.",
2396 dm_device_name(pool->pool_md));
2397 return 0;
2398 }
2399
2400 if (sb_data_size)
2401 DMINFO("%s: growing the data device from %llu to %llu blocks",
2402 dm_device_name(pool->pool_md),
2403 sb_data_size, (unsigned long long)data_size);
2195 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2404 r = dm_pool_resize_data_dev(pool->pmd, data_size);
2196 if (r) { 2405 if (r) {
2197 DMERR("%s: failed to resize data device", 2406 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
2198 dm_device_name(pool->pool_md));
2199 set_pool_mode(pool, PM_READ_ONLY);
2200 return r; 2407 return r;
2201 } 2408 }
2202 2409
@@ -2231,10 +2438,19 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
2231 return -EINVAL; 2438 return -EINVAL;
2232 2439
2233 } else if (metadata_dev_size > sb_metadata_dev_size) { 2440 } else if (metadata_dev_size > sb_metadata_dev_size) {
2441 if (dm_pool_metadata_needs_check(pool->pmd)) {
2442 DMERR("%s: unable to grow the metadata device until repaired.",
2443 dm_device_name(pool->pool_md));
2444 return 0;
2445 }
2446
2447 warn_if_metadata_device_too_big(pool->md_dev);
2448 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
2449 dm_device_name(pool->pool_md),
2450 sb_metadata_dev_size, metadata_dev_size);
2234 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); 2451 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
2235 if (r) { 2452 if (r) {
2236 DMERR("%s: failed to resize metadata device", 2453 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
2237 dm_device_name(pool->pool_md));
2238 return r; 2454 return r;
2239 } 2455 }
2240 2456
@@ -2290,8 +2506,7 @@ static void pool_resume(struct dm_target *ti)
2290 unsigned long flags; 2506 unsigned long flags;
2291 2507
2292 spin_lock_irqsave(&pool->lock, flags); 2508 spin_lock_irqsave(&pool->lock, flags);
2293 pool->low_water_triggered = 0; 2509 pool->low_water_triggered = false;
2294 pool->no_free_space = 0;
2295 __requeue_bios(pool); 2510 __requeue_bios(pool);
2296 spin_unlock_irqrestore(&pool->lock, flags); 2511 spin_unlock_irqrestore(&pool->lock, flags);
2297 2512
@@ -2510,7 +2725,8 @@ static void emit_flags(struct pool_features *pf, char *result,
2510 unsigned sz, unsigned maxlen) 2725 unsigned sz, unsigned maxlen)
2511{ 2726{
2512 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + 2727 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2513 !pf->discard_passdown + (pf->mode == PM_READ_ONLY); 2728 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
2729 pf->error_if_no_space;
2514 DMEMIT("%u ", count); 2730 DMEMIT("%u ", count);
2515 2731
2516 if (!pf->zero_new_blocks) 2732 if (!pf->zero_new_blocks)
@@ -2524,6 +2740,9 @@ static void emit_flags(struct pool_features *pf, char *result,
2524 2740
2525 if (pf->mode == PM_READ_ONLY) 2741 if (pf->mode == PM_READ_ONLY)
2526 DMEMIT("read_only "); 2742 DMEMIT("read_only ");
2743
2744 if (pf->error_if_no_space)
2745 DMEMIT("error_if_no_space ");
2527} 2746}
2528 2747
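emit_flags() has to keep the leading count in lock step with the words it prints, and the hunk adds error_if_no_space to both halves. The standalone sketch below shows the same count-then-emit discipline with snprintf standing in for DMEMIT; the PM_READ_ONLY mode test is simplified to a boolean.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct pool_features {
    bool zero_new_blocks, discard_enabled, discard_passdown;
    bool read_only, error_if_no_space;
};

static void emit_flags(const struct pool_features *pf, char *buf, size_t len)
{
    unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
                     !pf->discard_passdown + pf->read_only +
                     pf->error_if_no_space;
    size_t sz = 0;

    /* Count first, then one word per set flag, mirroring emit_flags(). */
    sz += snprintf(buf + sz, len - sz, "%u ", count);
    if (!pf->zero_new_blocks)
        sz += snprintf(buf + sz, len - sz, "skip_block_zeroing ");
    if (!pf->discard_enabled)
        sz += snprintf(buf + sz, len - sz, "ignore_discard ");
    if (!pf->discard_passdown)
        sz += snprintf(buf + sz, len - sz, "no_discard_passdown ");
    if (pf->read_only)
        sz += snprintf(buf + sz, len - sz, "read_only ");
    if (pf->error_if_no_space)
        sz += snprintf(buf + sz, len - sz, "error_if_no_space ");
}

int main(void)
{
    struct pool_features pf = { true, true, true, false, true };
    char line[128];

    emit_flags(&pf, line, sizeof(line));
    printf("%s\n", line);   /* prints: 1 error_if_no_space */
    return 0;
}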
2529/* 2748/*
@@ -2612,17 +2831,24 @@ static void pool_status(struct dm_target *ti, status_type_t type,
2612 else 2831 else
2613 DMEMIT("- "); 2832 DMEMIT("- ");
2614 2833
2615 if (pool->pf.mode == PM_READ_ONLY) 2834 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
2835 DMEMIT("out_of_data_space ");
2836 else if (pool->pf.mode == PM_READ_ONLY)
2616 DMEMIT("ro "); 2837 DMEMIT("ro ");
2617 else 2838 else
2618 DMEMIT("rw "); 2839 DMEMIT("rw ");
2619 2840
2620 if (!pool->pf.discard_enabled) 2841 if (!pool->pf.discard_enabled)
2621 DMEMIT("ignore_discard"); 2842 DMEMIT("ignore_discard ");
2622 else if (pool->pf.discard_passdown) 2843 else if (pool->pf.discard_passdown)
2623 DMEMIT("discard_passdown"); 2844 DMEMIT("discard_passdown ");
2624 else 2845 else
2625 DMEMIT("no_discard_passdown"); 2846 DMEMIT("no_discard_passdown ");
2847
2848 if (pool->pf.error_if_no_space)
2849 DMEMIT("error_if_no_space ");
2850 else
2851 DMEMIT("queue_if_no_space ");
2626 2852
2627 break; 2853 break;
2628 2854
@@ -2721,7 +2947,7 @@ static struct target_type pool_target = {
2721 .name = "thin-pool", 2947 .name = "thin-pool",
2722 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2948 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2723 DM_TARGET_IMMUTABLE, 2949 DM_TARGET_IMMUTABLE,
2724 .version = {1, 9, 0}, 2950 .version = {1, 11, 0},
2725 .module = THIS_MODULE, 2951 .module = THIS_MODULE,
2726 .ctr = pool_ctr, 2952 .ctr = pool_ctr,
2727 .dtr = pool_dtr, 2953 .dtr = pool_dtr,
@@ -2828,6 +3054,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2828 3054
2829 if (get_pool_mode(tc->pool) == PM_FAIL) { 3055 if (get_pool_mode(tc->pool) == PM_FAIL) {
2830 ti->error = "Couldn't open thin device, Pool is in fail mode"; 3056 ti->error = "Couldn't open thin device, Pool is in fail mode";
3057 r = -EINVAL;
2831 goto bad_thin_open; 3058 goto bad_thin_open;
2832 } 3059 }
2833 3060
@@ -2839,7 +3066,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2839 3066
2840 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 3067 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2841 if (r) 3068 if (r)
2842 goto bad_thin_open; 3069 goto bad_target_max_io_len;
2843 3070
2844 ti->num_flush_bios = 1; 3071 ti->num_flush_bios = 1;
2845 ti->flush_supported = true; 3072 ti->flush_supported = true;
@@ -2860,6 +3087,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2860 3087
2861 return 0; 3088 return 0;
2862 3089
3090bad_target_max_io_len:
3091 dm_pool_close_thin_device(tc->td);
2863bad_thin_open: 3092bad_thin_open:
2864 __pool_dec(tc->pool); 3093 __pool_dec(tc->pool);
2865bad_pool_lookup: 3094bad_pool_lookup:
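The new bad_target_max_io_len label closes a small leak: when dm_set_target_max_io_len() failed, the freshly opened thin device was never closed before the pool reference was dropped. The generic idiom is one label per acquired resource, unwound in reverse order; a userspace sketch with made-up resource names:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical resources standing in for the pool lookup, thin device, etc. */
static void *acquire(const char *what) { printf("acquire %s\n", what); return malloc(1); }
static void release(const char *what, void *p) { printf("release %s\n", what); free(p); }
static int might_fail(int fail) { return fail ? -1 : 0; }

static int ctr(int fail_step)
{
    void *pool = NULL, *thin = NULL;
    int r;

    pool = acquire("pool");
    if (!pool)
        return -1;

    thin = acquire("thin device");
    if (!thin) {
        r = -1;
        goto bad_thin_open;
    }

    r = might_fail(fail_step == 3);     /* e.g. dm_set_target_max_io_len() */
    if (r)
        goto bad_target_max_io_len;     /* must close the thin device too */

    return 0;

bad_target_max_io_len:
    release("thin device", thin);
bad_thin_open:
    release("pool", pool);
    return r;
}

int main(void)
{
    return ctr(3) ? 1 : 0;  /* fails at step 3, unwinds both resources */
}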
@@ -2879,7 +3108,7 @@ out_unlock:
2879 3108
2880static int thin_map(struct dm_target *ti, struct bio *bio) 3109static int thin_map(struct dm_target *ti, struct bio *bio)
2881{ 3110{
2882 bio->bi_sector = dm_target_offset(ti, bio->bi_sector); 3111 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
2883 3112
2884 return thin_bio_map(ti, bio); 3113 return thin_bio_map(ti, bio);
2885} 3114}
@@ -2899,7 +3128,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2899 spin_lock_irqsave(&pool->lock, flags); 3128 spin_lock_irqsave(&pool->lock, flags);
2900 list_for_each_entry_safe(m, tmp, &work, list) { 3129 list_for_each_entry_safe(m, tmp, &work, list) {
2901 list_del(&m->list); 3130 list_del(&m->list);
2902 m->quiesced = 1; 3131 m->quiesced = true;
2903 __maybe_add_mapping(m); 3132 __maybe_add_mapping(m);
2904 } 3133 }
2905 spin_unlock_irqrestore(&pool->lock, flags); 3134 spin_unlock_irqrestore(&pool->lock, flags);
@@ -2911,7 +3140,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2911 if (!list_empty(&work)) { 3140 if (!list_empty(&work)) {
2912 spin_lock_irqsave(&pool->lock, flags); 3141 spin_lock_irqsave(&pool->lock, flags);
2913 list_for_each_entry_safe(m, tmp, &work, list) 3142 list_for_each_entry_safe(m, tmp, &work, list)
2914 list_add(&m->list, &pool->prepared_discards); 3143 list_add_tail(&m->list, &pool->prepared_discards);
2915 spin_unlock_irqrestore(&pool->lock, flags); 3144 spin_unlock_irqrestore(&pool->lock, flags);
2916 wake_worker(pool); 3145 wake_worker(pool);
2917 } 3146 }
@@ -2920,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
2920 return 0; 3149 return 0;
2921} 3150}
2922 3151
2923static void thin_postsuspend(struct dm_target *ti) 3152static void thin_presuspend(struct dm_target *ti)
2924{ 3153{
3154 struct thin_c *tc = ti->private;
3155
2925 if (dm_noflush_suspending(ti)) 3156 if (dm_noflush_suspending(ti))
2926 requeue_io((struct thin_c *)ti->private); 3157 noflush_work(tc, do_noflush_start);
3158}
3159
3160static void thin_postsuspend(struct dm_target *ti)
3161{
3162 struct thin_c *tc = ti->private;
3163
3164 /*
3165 * The dm_noflush_suspending flag has been cleared by now, so
3166 * unfortunately we must always run this.
3167 */
3168 noflush_work(tc, do_noflush_stop);
2927} 3169}
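thin_presuspend()/thin_postsuspend() are split because dm_noflush_suspending() is only meaningful while the suspend is in progress, so requeue mode has to be entered in presuspend and left unconditionally in postsuspend. A simplified model of that pairing (the real noflush_work() synchronises with the pool worker, which is not reproduced here):

#include <stdbool.h>
#include <stdio.h>

struct thin_c {
    bool requeue_mode;  /* while set, thin_bio_map() requeues bios */
};

static void do_noflush_start(struct thin_c *tc)
{
    tc->requeue_mode = true;
    printf("requeue mode on: in-flight bios will be requeued\n");
}

static void do_noflush_stop(struct thin_c *tc)
{
    tc->requeue_mode = false;
    printf("requeue mode off\n");
}

static void thin_presuspend(struct thin_c *tc, bool noflush_suspend)
{
    /* Only a noflush suspend wants bios bounced back with REQUEUE. */
    if (noflush_suspend)
        do_noflush_start(tc);
}

static void thin_postsuspend(struct thin_c *tc)
{
    /*
     * The noflush flag is gone by postsuspend time, so this runs
     * unconditionally; stopping when never started is a no-op.
     */
    do_noflush_stop(tc);
}

int main(void)
{
    struct thin_c tc = { false };

    thin_presuspend(&tc, true);
    thin_postsuspend(&tc);
    return 0;
}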
2928 3170
2929/* 3171/*
@@ -3008,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti,
3008 3250
3009static struct target_type thin_target = { 3251static struct target_type thin_target = {
3010 .name = "thin", 3252 .name = "thin",
3011 .version = {1, 9, 0}, 3253 .version = {1, 11, 0},
3012 .module = THIS_MODULE, 3254 .module = THIS_MODULE,
3013 .ctr = thin_ctr, 3255 .ctr = thin_ctr,
3014 .dtr = thin_dtr, 3256 .dtr = thin_dtr,
3015 .map = thin_map, 3257 .map = thin_map,
3016 .end_io = thin_endio, 3258 .end_io = thin_endio,
3259 .presuspend = thin_presuspend,
3017 .postsuspend = thin_postsuspend, 3260 .postsuspend = thin_postsuspend,
3018 .status = thin_status, 3261 .status = thin_status,
3019 .iterate_devices = thin_iterate_devices, 3262 .iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 4b7941db3aff..796007a5e0e1 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -73,15 +73,10 @@ struct dm_verity_io {
73 sector_t block; 73 sector_t block;
74 unsigned n_blocks; 74 unsigned n_blocks;
75 75
76 /* saved bio vector */ 76 struct bvec_iter iter;
77 struct bio_vec *io_vec;
78 unsigned io_vec_size;
79 77
80 struct work_struct work; 78 struct work_struct work;
81 79
82 /* A space for short vectors; longer vectors are allocated separately. */
83 struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
84
85 /* 80 /*
86 * Three variably-size fields follow this struct: 81 * Three variably-size fields follow this struct:
87 * 82 *
@@ -284,9 +279,10 @@ release_ret_r:
284static int verity_verify_io(struct dm_verity_io *io) 279static int verity_verify_io(struct dm_verity_io *io)
285{ 280{
286 struct dm_verity *v = io->v; 281 struct dm_verity *v = io->v;
282 struct bio *bio = dm_bio_from_per_bio_data(io,
283 v->ti->per_bio_data_size);
287 unsigned b; 284 unsigned b;
288 int i; 285 int i;
289 unsigned vector = 0, offset = 0;
290 286
291 for (b = 0; b < io->n_blocks; b++) { 287 for (b = 0; b < io->n_blocks; b++) {
292 struct shash_desc *desc; 288 struct shash_desc *desc;
@@ -336,31 +332,22 @@ test_block_hash:
336 } 332 }
337 333
338 todo = 1 << v->data_dev_block_bits; 334 todo = 1 << v->data_dev_block_bits;
339 do { 335 while (io->iter.bi_size) {
340 struct bio_vec *bv;
341 u8 *page; 336 u8 *page;
342 unsigned len; 337 struct bio_vec bv = bio_iter_iovec(bio, io->iter);
343 338
344 BUG_ON(vector >= io->io_vec_size); 339 page = kmap_atomic(bv.bv_page);
345 bv = &io->io_vec[vector]; 340 r = crypto_shash_update(desc, page + bv.bv_offset,
346 page = kmap_atomic(bv->bv_page); 341 bv.bv_len);
347 len = bv->bv_len - offset;
348 if (likely(len >= todo))
349 len = todo;
350 r = crypto_shash_update(desc,
351 page + bv->bv_offset + offset, len);
352 kunmap_atomic(page); 342 kunmap_atomic(page);
343
353 if (r < 0) { 344 if (r < 0) {
354 DMERR("crypto_shash_update failed: %d", r); 345 DMERR("crypto_shash_update failed: %d", r);
355 return r; 346 return r;
356 } 347 }
357 offset += len; 348
358 if (likely(offset == bv->bv_len)) { 349 bio_advance_iter(bio, &io->iter, bv.bv_len);
359 offset = 0; 350 }
360 vector++;
361 }
362 todo -= len;
363 } while (todo);
364 351
365 if (!v->version) { 352 if (!v->version) {
366 r = crypto_shash_update(desc, v->salt, v->salt_size); 353 r = crypto_shash_update(desc, v->salt, v->salt_size);
@@ -383,8 +370,6 @@ test_block_hash:
383 return -EIO; 370 return -EIO;
384 } 371 }
385 } 372 }
386 BUG_ON(vector != io->io_vec_size);
387 BUG_ON(offset);
388 373
389 return 0; 374 return 0;
390} 375}
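With immutable biovecs, dm-verity stops copying the bio's vector and instead keeps a private bvec_iter that it walks with bio_iter_iovec()/bio_advance_iter(). The sketch below models that iterator over a plain array of segments and feeds each piece to a toy checksum where the kernel calls crypto_shash_update(); all of the types here are stand-ins, not the block-layer ones.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct segment {            /* stands in for struct bio_vec */
    const uint8_t *data;
    size_t len;
};

struct iter {               /* stands in for struct bvec_iter */
    size_t idx;             /* current segment */
    size_t done;            /* bytes consumed within that segment */
    size_t remaining;       /* bytes left in the whole "bio" */
};

/* Return the current contiguous piece without consuming it. */
static struct segment iter_iovec(const struct segment *segs, struct iter it)
{
    struct segment s = { segs[it.idx].data + it.done,
                         segs[it.idx].len - it.done };
    if (s.len > it.remaining)
        s.len = it.remaining;
    return s;
}

/* Advance by nbytes, moving on when a segment is exhausted (3 segments here). */
static void iter_advance(const struct segment *segs, struct iter *it, size_t nbytes)
{
    it->remaining -= nbytes;
    it->done += nbytes;
    while (it->idx < 3 && it->done == segs[it->idx].len) {
        it->idx++;
        it->done = 0;
    }
}

int main(void)
{
    const uint8_t a[] = "hello ", b[] = "immutable ", c[] = "biovecs";
    struct segment segs[3] = { { a, 6 }, { b, 10 }, { c, 7 } };
    struct iter it = { 0, 0, 23 };
    uint32_t sum = 0;

    while (it.remaining) {
        struct segment s = iter_iovec(segs, it);

        for (size_t i = 0; i < s.len; i++)  /* crypto_shash_update() stand-in */
            sum += s.data[i];
        iter_advance(segs, &it, s.len);
    }
    printf("checksum %u\n", sum);
    return 0;
}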
@@ -400,10 +385,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
400 bio->bi_end_io = io->orig_bi_end_io; 385 bio->bi_end_io = io->orig_bi_end_io;
401 bio->bi_private = io->orig_bi_private; 386 bio->bi_private = io->orig_bi_private;
402 387
403 if (io->io_vec != io->io_vec_inline) 388 bio_endio_nodec(bio, error);
404 mempool_free(io->io_vec, v->vec_mempool);
405
406 bio_endio(bio, error);
407} 389}
408 390
409static void verity_work(struct work_struct *w) 391static void verity_work(struct work_struct *w)
@@ -493,9 +475,9 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
493 struct dm_verity_io *io; 475 struct dm_verity_io *io;
494 476
495 bio->bi_bdev = v->data_dev->bdev; 477 bio->bi_bdev = v->data_dev->bdev;
496 bio->bi_sector = verity_map_sector(v, bio->bi_sector); 478 bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector);
497 479
498 if (((unsigned)bio->bi_sector | bio_sectors(bio)) & 480 if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
499 ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { 481 ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
500 DMERR_LIMIT("unaligned io"); 482 DMERR_LIMIT("unaligned io");
501 return -EIO; 483 return -EIO;
@@ -514,18 +496,12 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
514 io->v = v; 496 io->v = v;
515 io->orig_bi_end_io = bio->bi_end_io; 497 io->orig_bi_end_io = bio->bi_end_io;
516 io->orig_bi_private = bio->bi_private; 498 io->orig_bi_private = bio->bi_private;
517 io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); 499 io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
518 io->n_blocks = bio->bi_size >> v->data_dev_block_bits; 500 io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits;
519 501
520 bio->bi_end_io = verity_end_io; 502 bio->bi_end_io = verity_end_io;
521 bio->bi_private = io; 503 bio->bi_private = io;
522 io->io_vec_size = bio_segments(bio); 504 io->iter = bio->bi_iter;
523 if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
524 io->io_vec = io->io_vec_inline;
525 else
526 io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
527 memcpy(io->io_vec, bio_iovec(bio),
528 io->io_vec_size * sizeof(struct bio_vec));
529 505
530 verity_submit_prefetch(v, io); 506 verity_submit_prefetch(v, io);
531 507
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0704c523a76b..8c53b09b9a2c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -200,8 +200,8 @@ struct mapped_device {
200 /* forced geometry settings */ 200 /* forced geometry settings */
201 struct hd_geometry geometry; 201 struct hd_geometry geometry;
202 202
203 /* sysfs handle */ 203 /* kobject and completion */
204 struct kobject kobj; 204 struct dm_kobject_holder kobj_holder;
205 205
206 /* zero-length flush that will be cloned and submitted to targets */ 206 /* zero-length flush that will be cloned and submitted to targets */
207 struct bio flush_bio; 207 struct bio flush_bio;
@@ -575,7 +575,7 @@ static void start_io_acct(struct dm_io *io)
575 atomic_inc_return(&md->pending[rw])); 575 atomic_inc_return(&md->pending[rw]));
576 576
577 if (unlikely(dm_stats_used(&md->stats))) 577 if (unlikely(dm_stats_used(&md->stats)))
578 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, 578 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
579 bio_sectors(bio), false, 0, &io->stats_aux); 579 bio_sectors(bio), false, 0, &io->stats_aux);
580} 580}
581 581
@@ -593,7 +593,7 @@ static void end_io_acct(struct dm_io *io)
593 part_stat_unlock(); 593 part_stat_unlock();
594 594
595 if (unlikely(dm_stats_used(&md->stats))) 595 if (unlikely(dm_stats_used(&md->stats)))
596 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector, 596 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
597 bio_sectors(bio), true, duration, &io->stats_aux); 597 bio_sectors(bio), true, duration, &io->stats_aux);
598 598
599 /* 599 /*
@@ -742,7 +742,7 @@ static void dec_pending(struct dm_io *io, int error)
742 if (io_error == DM_ENDIO_REQUEUE) 742 if (io_error == DM_ENDIO_REQUEUE)
743 return; 743 return;
744 744
745 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { 745 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
746 /* 746 /*
747 * Preflush done for flush with data, reissue 747 * Preflush done for flush with data, reissue
748 * without REQ_FLUSH. 748 * without REQ_FLUSH.
@@ -797,7 +797,7 @@ static void end_clone_bio(struct bio *clone, int error)
797 struct dm_rq_clone_bio_info *info = clone->bi_private; 797 struct dm_rq_clone_bio_info *info = clone->bi_private;
798 struct dm_rq_target_io *tio = info->tio; 798 struct dm_rq_target_io *tio = info->tio;
799 struct bio *bio = info->orig; 799 struct bio *bio = info->orig;
800 unsigned int nr_bytes = info->orig->bi_size; 800 unsigned int nr_bytes = info->orig->bi_iter.bi_size;
801 801
802 bio_put(clone); 802 bio_put(clone);
803 803
@@ -1128,7 +1128,7 @@ static void __map_bio(struct dm_target_io *tio)
1128 * this io. 1128 * this io.
1129 */ 1129 */
1130 atomic_inc(&tio->io->io_count); 1130 atomic_inc(&tio->io->io_count);
1131 sector = clone->bi_sector; 1131 sector = clone->bi_iter.bi_sector;
1132 r = ti->type->map(ti, clone); 1132 r = ti->type->map(ti, clone);
1133 if (r == DM_MAPIO_REMAPPED) { 1133 if (r == DM_MAPIO_REMAPPED) {
1134 /* the bio has been remapped so dispatch it */ 1134 /* the bio has been remapped so dispatch it */
@@ -1155,76 +1155,32 @@ struct clone_info {
1155 struct dm_io *io; 1155 struct dm_io *io;
1156 sector_t sector; 1156 sector_t sector;
1157 sector_t sector_count; 1157 sector_t sector_count;
1158 unsigned short idx;
1159}; 1158};
1160 1159
1161static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) 1160static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
1162{ 1161{
1163 bio->bi_sector = sector; 1162 bio->bi_iter.bi_sector = sector;
1164 bio->bi_size = to_bytes(len); 1163 bio->bi_iter.bi_size = to_bytes(len);
1165}
1166
1167static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
1168{
1169 bio->bi_idx = idx;
1170 bio->bi_vcnt = idx + bv_count;
1171 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
1172}
1173
1174static void clone_bio_integrity(struct bio *bio, struct bio *clone,
1175 unsigned short idx, unsigned len, unsigned offset,
1176 unsigned trim)
1177{
1178 if (!bio_integrity(bio))
1179 return;
1180
1181 bio_integrity_clone(clone, bio, GFP_NOIO);
1182
1183 if (trim)
1184 bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
1185}
1186
1187/*
1188 * Creates a little bio that just does part of a bvec.
1189 */
1190static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
1191 sector_t sector, unsigned short idx,
1192 unsigned offset, unsigned len)
1193{
1194 struct bio *clone = &tio->clone;
1195 struct bio_vec *bv = bio->bi_io_vec + idx;
1196
1197 *clone->bi_io_vec = *bv;
1198
1199 bio_setup_sector(clone, sector, len);
1200
1201 clone->bi_bdev = bio->bi_bdev;
1202 clone->bi_rw = bio->bi_rw;
1203 clone->bi_vcnt = 1;
1204 clone->bi_io_vec->bv_offset = offset;
1205 clone->bi_io_vec->bv_len = clone->bi_size;
1206 clone->bi_flags |= 1 << BIO_CLONED;
1207
1208 clone_bio_integrity(bio, clone, idx, len, offset, 1);
1209} 1164}
1210 1165
1211/* 1166/*
1212 * Creates a bio that consists of range of complete bvecs. 1167 * Creates a bio that consists of range of complete bvecs.
1213 */ 1168 */
1214static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1169static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1215 sector_t sector, unsigned short idx, 1170 sector_t sector, unsigned len)
1216 unsigned short bv_count, unsigned len)
1217{ 1171{
1218 struct bio *clone = &tio->clone; 1172 struct bio *clone = &tio->clone;
1219 unsigned trim = 0;
1220 1173
1221 __bio_clone(clone, bio); 1174 __bio_clone_fast(clone, bio);
1222 bio_setup_sector(clone, sector, len); 1175
1223 bio_setup_bv(clone, idx, bv_count); 1176 if (bio_integrity(bio))
1177 bio_integrity_clone(clone, bio, GFP_NOIO);
1224 1178
1225 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1179 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1226 trim = 1; 1180 clone->bi_iter.bi_size = to_bytes(len);
1227 clone_bio_integrity(bio, clone, idx, len, 0, trim); 1181
1182 if (bio_integrity(bio))
1183 bio_integrity_trim(clone, 0, len);
1228} 1184}
1229 1185
1230static struct dm_target_io *alloc_tio(struct clone_info *ci, 1186static struct dm_target_io *alloc_tio(struct clone_info *ci,
@@ -1257,7 +1213,7 @@ static void __clone_and_map_simple_bio(struct clone_info *ci,
1257 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1213 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1258 * and discard, so no need for concern about wasted bvec allocations. 1214 * and discard, so no need for concern about wasted bvec allocations.
1259 */ 1215 */
1260 __bio_clone(clone, ci->bio); 1216 __bio_clone_fast(clone, ci->bio);
1261 if (len) 1217 if (len)
1262 bio_setup_sector(clone, ci->sector, len); 1218 bio_setup_sector(clone, ci->sector, len);
1263 1219
@@ -1286,10 +1242,7 @@ static int __send_empty_flush(struct clone_info *ci)
1286} 1242}
1287 1243
1288static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1244static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1289 sector_t sector, int nr_iovecs, 1245 sector_t sector, unsigned len)
1290 unsigned short idx, unsigned short bv_count,
1291 unsigned offset, unsigned len,
1292 unsigned split_bvec)
1293{ 1246{
1294 struct bio *bio = ci->bio; 1247 struct bio *bio = ci->bio;
1295 struct dm_target_io *tio; 1248 struct dm_target_io *tio;
@@ -1303,11 +1256,8 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti
1303 num_target_bios = ti->num_write_bios(ti, bio); 1256 num_target_bios = ti->num_write_bios(ti, bio);
1304 1257
1305 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { 1258 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1306 tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr); 1259 tio = alloc_tio(ci, ti, 0, target_bio_nr);
1307 if (split_bvec) 1260 clone_bio(tio, bio, sector, len);
1308 clone_split_bio(tio, bio, sector, idx, offset, len);
1309 else
1310 clone_bio(tio, bio, sector, idx, bv_count, len);
1311 __map_bio(tio); 1261 __map_bio(tio);
1312 } 1262 }
1313} 1263}
@@ -1379,68 +1329,13 @@ static int __send_write_same(struct clone_info *ci)
1379} 1329}
1380 1330
1381/* 1331/*
1382 * Find maximum number of sectors / bvecs we can process with a single bio.
1383 */
1384static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
1385{
1386 struct bio *bio = ci->bio;
1387 sector_t bv_len, total_len = 0;
1388
1389 for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
1390 bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
1391
1392 if (bv_len > max)
1393 break;
1394
1395 max -= bv_len;
1396 total_len += bv_len;
1397 }
1398
1399 return total_len;
1400}
1401
1402static int __split_bvec_across_targets(struct clone_info *ci,
1403 struct dm_target *ti, sector_t max)
1404{
1405 struct bio *bio = ci->bio;
1406 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1407 sector_t remaining = to_sector(bv->bv_len);
1408 unsigned offset = 0;
1409 sector_t len;
1410
1411 do {
1412 if (offset) {
1413 ti = dm_table_find_target(ci->map, ci->sector);
1414 if (!dm_target_is_valid(ti))
1415 return -EIO;
1416
1417 max = max_io_len(ci->sector, ti);
1418 }
1419
1420 len = min(remaining, max);
1421
1422 __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
1423 bv->bv_offset + offset, len, 1);
1424
1425 ci->sector += len;
1426 ci->sector_count -= len;
1427 offset += to_bytes(len);
1428 } while (remaining -= len);
1429
1430 ci->idx++;
1431
1432 return 0;
1433}
1434
1435/*
1436 * Select the correct strategy for processing a non-flush bio. 1332 * Select the correct strategy for processing a non-flush bio.
1437 */ 1333 */
1438static int __split_and_process_non_flush(struct clone_info *ci) 1334static int __split_and_process_non_flush(struct clone_info *ci)
1439{ 1335{
1440 struct bio *bio = ci->bio; 1336 struct bio *bio = ci->bio;
1441 struct dm_target *ti; 1337 struct dm_target *ti;
1442 sector_t len, max; 1338 unsigned len;
1443 int idx;
1444 1339
1445 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1340 if (unlikely(bio->bi_rw & REQ_DISCARD))
1446 return __send_discard(ci); 1341 return __send_discard(ci);
@@ -1451,41 +1346,14 @@ static int __split_and_process_non_flush(struct clone_info *ci)
1451 if (!dm_target_is_valid(ti)) 1346 if (!dm_target_is_valid(ti))
1452 return -EIO; 1347 return -EIO;
1453 1348
1454 max = max_io_len(ci->sector, ti); 1349 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1455
1456 /*
1457 * Optimise for the simple case where we can do all of
1458 * the remaining io with a single clone.
1459 */
1460 if (ci->sector_count <= max) {
1461 __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1462 ci->idx, bio->bi_vcnt - ci->idx, 0,
1463 ci->sector_count, 0);
1464 ci->sector_count = 0;
1465 return 0;
1466 }
1467 1350
1468 /* 1351 __clone_and_map_data_bio(ci, ti, ci->sector, len);
1469 * There are some bvecs that don't span targets.
1470 * Do as many of these as possible.
1471 */
1472 if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1473 len = __len_within_target(ci, max, &idx);
1474
1475 __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1476 ci->idx, idx - ci->idx, 0, len, 0);
1477 1352
1478 ci->sector += len; 1353 ci->sector += len;
1479 ci->sector_count -= len; 1354 ci->sector_count -= len;
1480 ci->idx = idx;
1481 1355
1482 return 0; 1356 return 0;
1483 }
1484
1485 /*
1486 * Handle a bvec that must be split between two or more targets.
1487 */
1488 return __split_bvec_across_targets(ci, ti, max);
1489} 1357}
1490 1358
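Because the block layer can now split bios itself, __split_and_process_non_flush() reduces to "clone min(max_io_len, remaining) and advance". A standalone model of that loop over a few invented target boundaries:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Hypothetical target boundaries: [0,100), [100,250), [250,1000). */
static const sector_t bounds[] = { 0, 100, 250, 1000 };

/* Sectors from 'sector' to the end of the target containing it. */
static sector_t max_io_len(sector_t sector)
{
    for (int i = 1; i < 4; i++)
        if (sector < bounds[i])
            return bounds[i] - sector;
    return 0;
}

int main(void)
{
    sector_t sector = 90, count = 200;  /* a "bio" spanning two boundaries */

    while (count) {
        sector_t len = max_io_len(sector);

        if (!len)
            break;          /* past the table; cannot happen in this example */
        if (len > count)
            len = count;    /* the min(), as in the new code */
        printf("clone: sector %llu len %llu\n",
               (unsigned long long)sector, (unsigned long long)len);
        sector += len;
        count -= len;
    }
    return 0;
}

The three clones it prints (90+10, 100+150, 250+40) correspond to what used to need the bvec-walking helpers deleted above.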
1491/* 1359/*
@@ -1510,8 +1378,7 @@ static void __split_and_process_bio(struct mapped_device *md,
1510 ci.io->bio = bio; 1378 ci.io->bio = bio;
1511 ci.io->md = md; 1379 ci.io->md = md;
1512 spin_lock_init(&ci.io->endio_lock); 1380 spin_lock_init(&ci.io->endio_lock);
1513 ci.sector = bio->bi_sector; 1381 ci.sector = bio->bi_iter.bi_sector;
1514 ci.idx = bio->bi_idx;
1515 1382
1516 start_io_acct(ci.io); 1383 start_io_acct(ci.io);
1517 1384
@@ -2041,6 +1908,7 @@ static struct mapped_device *alloc_dev(int minor)
2041 init_waitqueue_head(&md->wait); 1908 init_waitqueue_head(&md->wait);
2042 INIT_WORK(&md->work, dm_wq_work); 1909 INIT_WORK(&md->work, dm_wq_work);
2043 init_waitqueue_head(&md->eventq); 1910 init_waitqueue_head(&md->eventq);
1911 init_completion(&md->kobj_holder.completion);
2044 1912
2045 md->disk->major = _major; 1913 md->disk->major = _major;
2046 md->disk->first_minor = minor; 1914 md->disk->first_minor = minor;
@@ -2902,20 +2770,14 @@ struct gendisk *dm_disk(struct mapped_device *md)
2902 2770
2903struct kobject *dm_kobject(struct mapped_device *md) 2771struct kobject *dm_kobject(struct mapped_device *md)
2904{ 2772{
2905 return &md->kobj; 2773 return &md->kobj_holder.kobj;
2906} 2774}
2907 2775
2908/*
2909 * struct mapped_device should not be exported outside of dm.c
2910 * so use this check to verify that kobj is part of md structure
2911 */
2912struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2776struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2913{ 2777{
2914 struct mapped_device *md; 2778 struct mapped_device *md;
2915 2779
2916 md = container_of(kobj, struct mapped_device, kobj); 2780 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2917 if (&md->kobj != kobj)
2918 return NULL;
2919 2781
2920 if (test_bit(DMF_FREEING, &md->flags) || 2782 if (test_bit(DMF_FREEING, &md->flags) ||
2921 dm_deleting_md(md)) 2783 dm_deleting_md(md))
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index c57ba550f69e..c4569f02f50f 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -15,6 +15,8 @@
15#include <linux/list.h> 15#include <linux/list.h>
16#include <linux/blkdev.h> 16#include <linux/blkdev.h>
17#include <linux/hdreg.h> 17#include <linux/hdreg.h>
18#include <linux/completion.h>
19#include <linux/kobject.h>
18 20
19#include "dm-stats.h" 21#include "dm-stats.h"
20 22
@@ -148,12 +150,27 @@ void dm_interface_exit(void);
148/* 150/*
149 * sysfs interface 151 * sysfs interface
150 */ 152 */
153struct dm_kobject_holder {
154 struct kobject kobj;
155 struct completion completion;
156};
157
158static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
159{
160 return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
161}
162
151int dm_sysfs_init(struct mapped_device *md); 163int dm_sysfs_init(struct mapped_device *md);
152void dm_sysfs_exit(struct mapped_device *md); 164void dm_sysfs_exit(struct mapped_device *md);
153struct kobject *dm_kobject(struct mapped_device *md); 165struct kobject *dm_kobject(struct mapped_device *md);
154struct mapped_device *dm_get_from_kobject(struct kobject *kobj); 166struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
155 167
156/* 168/*
169 * The kobject helper
170 */
171void dm_kobject_release(struct kobject *kobj);
172
173/*
157 * Targets for linear and striped mappings 174 * Targets for linear and striped mappings
158 */ 175 */
159int dm_linear_init(void); 176int dm_linear_init(void);
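The dm_kobject_holder pattern embeds the kobject next to a completion so dm_get_from_kobject()/dm_get_completion_from_kobject() can recover the owning structure with container_of() instead of the old pointer-comparison check. A freestanding illustration of embed-and-recover (plain structs, no sysfs):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct kobject { const char *name; };
struct completion { int done; };

struct dm_kobject_holder {
    struct kobject kobj;        /* embedded, not a pointer */
    struct completion completion;
};

static struct completion *get_completion_from_kobject(struct kobject *kobj)
{
    return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
}

int main(void)
{
    struct dm_kobject_holder holder = { { "dm-device" }, { 0 } };
    struct kobject *kobj = &holder.kobj;    /* what sysfs code would hand back */

    get_completion_from_kobject(kobj)->done = 1;
    printf("%s: completion done=%d\n", kobj->name, holder.completion.done);
    return 0;
}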
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 3193aefe982b..e8b4574956c7 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -74,8 +74,8 @@ static void faulty_fail(struct bio *bio, int error)
74{ 74{
75 struct bio *b = bio->bi_private; 75 struct bio *b = bio->bi_private;
76 76
77 b->bi_size = bio->bi_size; 77 b->bi_iter.bi_size = bio->bi_iter.bi_size;
78 b->bi_sector = bio->bi_sector; 78 b->bi_iter.bi_sector = bio->bi_iter.bi_sector;
79 79
80 bio_put(bio); 80 bio_put(bio);
81 81
@@ -185,26 +185,31 @@ static void make_request(struct mddev *mddev, struct bio *bio)
185 return; 185 return;
186 } 186 }
187 187
188 if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE)) 188 if (check_sector(conf, bio->bi_iter.bi_sector,
189 bio_end_sector(bio), WRITE))
189 failit = 1; 190 failit = 1;
190 if (check_mode(conf, WritePersistent)) { 191 if (check_mode(conf, WritePersistent)) {
191 add_sector(conf, bio->bi_sector, WritePersistent); 192 add_sector(conf, bio->bi_iter.bi_sector,
193 WritePersistent);
192 failit = 1; 194 failit = 1;
193 } 195 }
194 if (check_mode(conf, WriteTransient)) 196 if (check_mode(conf, WriteTransient))
195 failit = 1; 197 failit = 1;
196 } else { 198 } else {
197 /* read request */ 199 /* read request */
198 if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ)) 200 if (check_sector(conf, bio->bi_iter.bi_sector,
201 bio_end_sector(bio), READ))
199 failit = 1; 202 failit = 1;
200 if (check_mode(conf, ReadTransient)) 203 if (check_mode(conf, ReadTransient))
201 failit = 1; 204 failit = 1;
202 if (check_mode(conf, ReadPersistent)) { 205 if (check_mode(conf, ReadPersistent)) {
203 add_sector(conf, bio->bi_sector, ReadPersistent); 206 add_sector(conf, bio->bi_iter.bi_sector,
207 ReadPersistent);
204 failit = 1; 208 failit = 1;
205 } 209 }
206 if (check_mode(conf, ReadFixable)) { 210 if (check_mode(conf, ReadFixable)) {
207 add_sector(conf, bio->bi_sector, ReadFixable); 211 add_sector(conf, bio->bi_iter.bi_sector,
212 ReadFixable);
208 failit = 1; 213 failit = 1;
209 } 214 }
210 } 215 }
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index f03fabd2b37b..56f534b4a2d2 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -288,65 +288,65 @@ static int linear_stop (struct mddev *mddev)
288 288
289static void linear_make_request(struct mddev *mddev, struct bio *bio) 289static void linear_make_request(struct mddev *mddev, struct bio *bio)
290{ 290{
291 char b[BDEVNAME_SIZE];
291 struct dev_info *tmp_dev; 292 struct dev_info *tmp_dev;
292 sector_t start_sector; 293 struct bio *split;
294 sector_t start_sector, end_sector, data_offset;
293 295
294 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 296 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
295 md_flush_request(mddev, bio); 297 md_flush_request(mddev, bio);
296 return; 298 return;
297 } 299 }
298 300
299 rcu_read_lock(); 301 do {
300 tmp_dev = which_dev(mddev, bio->bi_sector); 302 rcu_read_lock();
301 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
302
303
304 if (unlikely(bio->bi_sector >= (tmp_dev->end_sector)
305 || (bio->bi_sector < start_sector))) {
306 char b[BDEVNAME_SIZE];
307
308 printk(KERN_ERR
309 "md/linear:%s: make_request: Sector %llu out of bounds on "
310 "dev %s: %llu sectors, offset %llu\n",
311 mdname(mddev),
312 (unsigned long long)bio->bi_sector,
313 bdevname(tmp_dev->rdev->bdev, b),
314 (unsigned long long)tmp_dev->rdev->sectors,
315 (unsigned long long)start_sector);
316 rcu_read_unlock();
317 bio_io_error(bio);
318 return;
319 }
320 if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) {
321 /* This bio crosses a device boundary, so we have to
322 * split it.
323 */
324 struct bio_pair *bp;
325 sector_t end_sector = tmp_dev->end_sector;
326 303
327 rcu_read_unlock(); 304 tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
328 305 start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
329 bp = bio_split(bio, end_sector - bio->bi_sector); 306 end_sector = tmp_dev->end_sector;
307 data_offset = tmp_dev->rdev->data_offset;
308 bio->bi_bdev = tmp_dev->rdev->bdev;
330 309
331 linear_make_request(mddev, &bp->bio1); 310 rcu_read_unlock();
332 linear_make_request(mddev, &bp->bio2);
333 bio_pair_release(bp);
334 return;
335 }
336
337 bio->bi_bdev = tmp_dev->rdev->bdev;
338 bio->bi_sector = bio->bi_sector - start_sector
339 + tmp_dev->rdev->data_offset;
340 rcu_read_unlock();
341 311
342 if (unlikely((bio->bi_rw & REQ_DISCARD) && 312 if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
343 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { 313 bio->bi_iter.bi_sector < start_sector))
344 /* Just ignore it */ 314 goto out_of_bounds;
345 bio_endio(bio, 0); 315
346 return; 316 if (unlikely(bio_end_sector(bio) > end_sector)) {
347 } 317 /* This bio crosses a device boundary, so we have to
318 * split it.
319 */
320 split = bio_split(bio, end_sector -
321 bio->bi_iter.bi_sector,
322 GFP_NOIO, fs_bio_set);
323 bio_chain(split, bio);
324 } else {
325 split = bio;
326 }
348 327
349 generic_make_request(bio); 328 split->bi_iter.bi_sector = split->bi_iter.bi_sector -
329 start_sector + data_offset;
330
331 if (unlikely((split->bi_rw & REQ_DISCARD) &&
332 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
333 /* Just ignore it */
334 bio_endio(split, 0);
335 } else
336 generic_make_request(split);
337 } while (split != bio);
338 return;
339
340out_of_bounds:
341 printk(KERN_ERR
342 "md/linear:%s: make_request: Sector %llu out of bounds on "
343 "dev %s: %llu sectors, offset %llu\n",
344 mdname(mddev),
345 (unsigned long long)bio->bi_iter.bi_sector,
346 bdevname(tmp_dev->rdev->bdev, b),
347 (unsigned long long)tmp_dev->rdev->sectors,
348 (unsigned long long)start_sector);
349 bio_io_error(bio);
350} 350}
351 351
352static void linear_status (struct seq_file *seq, struct mddev *mddev) 352static void linear_status (struct seq_file *seq, struct mddev *mddev)
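linear_make_request() now loops, splitting with bio_split() and tying each piece to the original with bio_chain() whenever the bio crosses a component-device boundary. The sketch below models only the chain bookkeeping, i.e. that the parent cannot complete until every chained split has; the structs are simplified stand-ins for the real bio fields.

#include <stddef.h>
#include <stdio.h>

struct bio {
    int remaining;          /* stands in for __bi_remaining */
    struct bio *parent;     /* set by bio_chain() */
    const char *name;
};

static void bio_endio(struct bio *bio)
{
    if (--bio->remaining)
        return;
    printf("%s: completed\n", bio->name);
    if (bio->parent)
        bio_endio(bio->parent); /* a chained completion propagates upward */
}

/* bio_chain(): the parent cannot complete until the child has. */
static void bio_chain(struct bio *child, struct bio *parent)
{
    child->parent = parent;
    parent->remaining++;
}

int main(void)
{
    struct bio parent = { 1, NULL, "original bio" };
    struct bio split  = { 1, NULL, "split" };

    bio_chain(&split, &parent); /* the bio crossed a device boundary */

    bio_endio(&parent);         /* device finished the trailing part first */
    printf("parent still pending: %d reference(s) left\n", parent.remaining);
    bio_endio(&split);          /* split finishes; now the parent completes */
    return 0;
}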
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 21f4d7ff0da2..4ad5cc4e63e8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -393,7 +393,7 @@ static void md_submit_flush_data(struct work_struct *ws)
393 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 393 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
394 struct bio *bio = mddev->flush_bio; 394 struct bio *bio = mddev->flush_bio;
395 395
396 if (bio->bi_size == 0) 396 if (bio->bi_iter.bi_size == 0)
397 /* an empty barrier - all done */ 397 /* an empty barrier - all done */
398 bio_endio(bio, 0); 398 bio_endio(bio, 0);
399 else { 399 else {
@@ -754,7 +754,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
754 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); 754 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
755 755
756 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; 756 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
757 bio->bi_sector = sector; 757 bio->bi_iter.bi_sector = sector;
758 bio_add_page(bio, page, size, 0); 758 bio_add_page(bio, page, size, 0);
759 bio->bi_private = rdev; 759 bio->bi_private = rdev;
760 bio->bi_end_io = super_written; 760 bio->bi_end_io = super_written;
@@ -782,18 +782,16 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
782 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); 782 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
783 int ret; 783 int ret;
784 784
785 rw |= REQ_SYNC;
786
787 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 785 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
788 rdev->meta_bdev : rdev->bdev; 786 rdev->meta_bdev : rdev->bdev;
789 if (metadata_op) 787 if (metadata_op)
790 bio->bi_sector = sector + rdev->sb_start; 788 bio->bi_iter.bi_sector = sector + rdev->sb_start;
791 else if (rdev->mddev->reshape_position != MaxSector && 789 else if (rdev->mddev->reshape_position != MaxSector &&
792 (rdev->mddev->reshape_backwards == 790 (rdev->mddev->reshape_backwards ==
793 (sector >= rdev->mddev->reshape_position))) 791 (sector >= rdev->mddev->reshape_position)))
794 bio->bi_sector = sector + rdev->new_data_offset; 792 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
795 else 793 else
796 bio->bi_sector = sector + rdev->data_offset; 794 bio->bi_iter.bi_sector = sector + rdev->data_offset;
797 bio_add_page(bio, page, size, 0); 795 bio_add_page(bio, page, size, 0);
798 submit_bio_wait(rw, bio); 796 submit_bio_wait(rw, bio);
799 797
@@ -1077,6 +1075,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1077 rdev->raid_disk = -1; 1075 rdev->raid_disk = -1;
1078 clear_bit(Faulty, &rdev->flags); 1076 clear_bit(Faulty, &rdev->flags);
1079 clear_bit(In_sync, &rdev->flags); 1077 clear_bit(In_sync, &rdev->flags);
1078 clear_bit(Bitmap_sync, &rdev->flags);
1080 clear_bit(WriteMostly, &rdev->flags); 1079 clear_bit(WriteMostly, &rdev->flags);
1081 1080
1082 if (mddev->raid_disks == 0) { 1081 if (mddev->raid_disks == 0) {
@@ -1155,6 +1154,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1155 */ 1154 */
1156 if (ev1 < mddev->bitmap->events_cleared) 1155 if (ev1 < mddev->bitmap->events_cleared)
1157 return 0; 1156 return 0;
1157 if (ev1 < mddev->events)
1158 set_bit(Bitmap_sync, &rdev->flags);
1158 } else { 1159 } else {
1159 if (ev1 < mddev->events) 1160 if (ev1 < mddev->events)
1160 /* just a hot-add of a new device, leave raid_disk at -1 */ 1161 /* just a hot-add of a new device, leave raid_disk at -1 */
@@ -1170,6 +1171,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1170 desc->raid_disk < mddev->raid_disks */) { 1171 desc->raid_disk < mddev->raid_disks */) {
1171 set_bit(In_sync, &rdev->flags); 1172 set_bit(In_sync, &rdev->flags);
1172 rdev->raid_disk = desc->raid_disk; 1173 rdev->raid_disk = desc->raid_disk;
1174 rdev->saved_raid_disk = desc->raid_disk;
1173 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1175 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1174 /* active but not in sync implies recovery up to 1176 /* active but not in sync implies recovery up to
1175 * reshape position. We don't know exactly where 1177 * reshape position. We don't know exactly where
@@ -1563,6 +1565,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1563 rdev->raid_disk = -1; 1565 rdev->raid_disk = -1;
1564 clear_bit(Faulty, &rdev->flags); 1566 clear_bit(Faulty, &rdev->flags);
1565 clear_bit(In_sync, &rdev->flags); 1567 clear_bit(In_sync, &rdev->flags);
1568 clear_bit(Bitmap_sync, &rdev->flags);
1566 clear_bit(WriteMostly, &rdev->flags); 1569 clear_bit(WriteMostly, &rdev->flags);
1567 1570
1568 if (mddev->raid_disks == 0) { 1571 if (mddev->raid_disks == 0) {
@@ -1645,6 +1648,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1645 */ 1648 */
1646 if (ev1 < mddev->bitmap->events_cleared) 1649 if (ev1 < mddev->bitmap->events_cleared)
1647 return 0; 1650 return 0;
1651 if (ev1 < mddev->events)
1652 set_bit(Bitmap_sync, &rdev->flags);
1648 } else { 1653 } else {
1649 if (ev1 < mddev->events) 1654 if (ev1 < mddev->events)
1650 /* just a hot-add of a new device, leave raid_disk at -1 */ 1655 /* just a hot-add of a new device, leave raid_disk at -1 */
@@ -1665,10 +1670,14 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1665 set_bit(Faulty, &rdev->flags); 1670 set_bit(Faulty, &rdev->flags);
1666 break; 1671 break;
1667 default: 1672 default:
1673 rdev->saved_raid_disk = role;
1668 if ((le32_to_cpu(sb->feature_map) & 1674 if ((le32_to_cpu(sb->feature_map) &
1669 MD_FEATURE_RECOVERY_OFFSET)) 1675 MD_FEATURE_RECOVERY_OFFSET)) {
1670 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1676 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1671 else 1677 if (!(le32_to_cpu(sb->feature_map) &
1678 MD_FEATURE_RECOVERY_BITMAP))
1679 rdev->saved_raid_disk = -1;
1680 } else
1672 set_bit(In_sync, &rdev->flags); 1681 set_bit(In_sync, &rdev->flags);
1673 rdev->raid_disk = role; 1682 rdev->raid_disk = role;
1674 break; 1683 break;
@@ -1730,6 +1739,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1730 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1739 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1731 sb->recovery_offset = 1740 sb->recovery_offset =
1732 cpu_to_le64(rdev->recovery_offset); 1741 cpu_to_le64(rdev->recovery_offset);
1742 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1743 sb->feature_map |=
1744 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1733 } 1745 }
1734 if (test_bit(Replacement, &rdev->flags)) 1746 if (test_bit(Replacement, &rdev->flags))
1735 sb->feature_map |= 1747 sb->feature_map |=
@@ -2471,8 +2483,7 @@ repeat:
2471 if (rdev->sb_loaded != 1) 2483 if (rdev->sb_loaded != 1)
2472 continue; /* no noise on spare devices */ 2484 continue; /* no noise on spare devices */
2473 2485
2474 if (!test_bit(Faulty, &rdev->flags) && 2486 if (!test_bit(Faulty, &rdev->flags)) {
2475 rdev->saved_raid_disk == -1) {
2476 md_super_write(mddev,rdev, 2487 md_super_write(mddev,rdev,
2477 rdev->sb_start, rdev->sb_size, 2488 rdev->sb_start, rdev->sb_size,
2478 rdev->sb_page); 2489 rdev->sb_page);
@@ -2488,11 +2499,9 @@ repeat:
2488 rdev->badblocks.size = 0; 2499 rdev->badblocks.size = 0;
2489 } 2500 }
2490 2501
2491 } else if (test_bit(Faulty, &rdev->flags)) 2502 } else
2492 pr_debug("md: %s (skipping faulty)\n", 2503 pr_debug("md: %s (skipping faulty)\n",
2493 bdevname(rdev->bdev, b)); 2504 bdevname(rdev->bdev, b));
2494 else
2495 pr_debug("(skipping incremental s/r ");
2496 2505
2497 if (mddev->level == LEVEL_MULTIPATH) 2506 if (mddev->level == LEVEL_MULTIPATH)
2498 /* only need to write one superblock... */ 2507 /* only need to write one superblock... */
@@ -2608,6 +2617,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2608 * blocked - sets the Blocked flags 2617 * blocked - sets the Blocked flags
2609 * -blocked - clears the Blocked and possibly simulates an error 2618 * -blocked - clears the Blocked and possibly simulates an error
2610 * insync - sets Insync providing device isn't active 2619 * insync - sets Insync providing device isn't active
2620 * -insync - clear Insync for a device with a slot assigned,
2621 * so that it gets rebuilt based on bitmap
2611 * write_error - sets WriteErrorSeen 2622 * write_error - sets WriteErrorSeen
2612 * -write_error - clears WriteErrorSeen 2623 * -write_error - clears WriteErrorSeen
2613 */ 2624 */
@@ -2656,6 +2667,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2656 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2667 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2657 set_bit(In_sync, &rdev->flags); 2668 set_bit(In_sync, &rdev->flags);
2658 err = 0; 2669 err = 0;
2670 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
2671 clear_bit(In_sync, &rdev->flags);
2672 rdev->saved_raid_disk = rdev->raid_disk;
2673 rdev->raid_disk = -1;
2674 err = 0;
2659 } else if (cmd_match(buf, "write_error")) { 2675 } else if (cmd_match(buf, "write_error")) {
2660 set_bit(WriteErrorSeen, &rdev->flags); 2676 set_bit(WriteErrorSeen, &rdev->flags);
2661 err = 0; 2677 err = 0;
@@ -2788,6 +2804,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2788 else 2804 else
2789 rdev->saved_raid_disk = -1; 2805 rdev->saved_raid_disk = -1;
2790 clear_bit(In_sync, &rdev->flags); 2806 clear_bit(In_sync, &rdev->flags);
2807 clear_bit(Bitmap_sync, &rdev->flags);
2791 err = rdev->mddev->pers-> 2808 err = rdev->mddev->pers->
2792 hot_add_disk(rdev->mddev, rdev); 2809 hot_add_disk(rdev->mddev, rdev);
2793 if (err) { 2810 if (err) {
@@ -3582,6 +3599,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3582 pers->run(mddev); 3599 pers->run(mddev);
3583 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3600 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3584 mddev_resume(mddev); 3601 mddev_resume(mddev);
3602 if (!mddev->thread)
3603 md_update_sb(mddev, 1);
3585 sysfs_notify(&mddev->kobj, NULL, "level"); 3604 sysfs_notify(&mddev->kobj, NULL, "level");
3586 md_new_event(mddev); 3605 md_new_event(mddev);
3587 return rv; 3606 return rv;
@@ -5760,8 +5779,10 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5760 info->raid_disk < mddev->raid_disks) { 5779 info->raid_disk < mddev->raid_disks) {
5761 rdev->raid_disk = info->raid_disk; 5780 rdev->raid_disk = info->raid_disk;
5762 set_bit(In_sync, &rdev->flags); 5781 set_bit(In_sync, &rdev->flags);
5782 clear_bit(Bitmap_sync, &rdev->flags);
5763 } else 5783 } else
5764 rdev->raid_disk = -1; 5784 rdev->raid_disk = -1;
5785 rdev->saved_raid_disk = rdev->raid_disk;
5765 } else 5786 } else
5766 super_types[mddev->major_version]. 5787 super_types[mddev->major_version].
5767 validate_super(mddev, rdev); 5788 validate_super(mddev, rdev);
@@ -5774,11 +5795,6 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5774 return -EINVAL; 5795 return -EINVAL;
5775 } 5796 }
5776 5797
5777 if (test_bit(In_sync, &rdev->flags))
5778 rdev->saved_raid_disk = rdev->raid_disk;
5779 else
5780 rdev->saved_raid_disk = -1;
5781
5782 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 5798 clear_bit(In_sync, &rdev->flags); /* just to be sure */
5783 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5799 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5784 set_bit(WriteMostly, &rdev->flags); 5800 set_bit(WriteMostly, &rdev->flags);
@@ -6328,6 +6344,32 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6328 return 0; 6344 return 0;
6329} 6345}
6330 6346
6347static inline bool md_ioctl_valid(unsigned int cmd)
6348{
6349 switch (cmd) {
6350 case ADD_NEW_DISK:
6351 case BLKROSET:
6352 case GET_ARRAY_INFO:
6353 case GET_BITMAP_FILE:
6354 case GET_DISK_INFO:
6355 case HOT_ADD_DISK:
6356 case HOT_REMOVE_DISK:
6357 case PRINT_RAID_DEBUG:
6358 case RAID_AUTORUN:
6359 case RAID_VERSION:
6360 case RESTART_ARRAY_RW:
6361 case RUN_ARRAY:
6362 case SET_ARRAY_INFO:
6363 case SET_BITMAP_FILE:
6364 case SET_DISK_FAULTY:
6365 case STOP_ARRAY:
6366 case STOP_ARRAY_RO:
6367 return true;
6368 default:
6369 return false;
6370 }
6371}
6372
6331static int md_ioctl(struct block_device *bdev, fmode_t mode, 6373static int md_ioctl(struct block_device *bdev, fmode_t mode,
6332 unsigned int cmd, unsigned long arg) 6374 unsigned int cmd, unsigned long arg)
6333{ 6375{
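md_ioctl_valid() front-loads a whitelist so unknown commands are rejected with -ENOTTY before any mddev lookup or locking. The same shape in a trivial userspace form, with invented command numbers:

#include <stdbool.h>
#include <stdio.h>
#include <errno.h>

/* Invented command numbers, for illustration only. */
#define DEMO_GET_INFO  0x900
#define DEMO_SET_INFO  0x901
#define DEMO_STOP      0x902

static bool demo_ioctl_valid(unsigned int cmd)
{
    switch (cmd) {
    case DEMO_GET_INFO:
    case DEMO_SET_INFO:
    case DEMO_STOP:
        return true;
    default:
        return false;
    }
}

static int demo_ioctl(unsigned int cmd)
{
    if (!demo_ioctl_valid(cmd))
        return -ENOTTY;     /* reject before taking any locks */

    /* ... per-command handling would follow ... */
    return 0;
}

int main(void)
{
    printf("known:   %d\n", demo_ioctl(DEMO_STOP));  /* 0 */
    printf("unknown: %d\n", demo_ioctl(0xdead));     /* -ENOTTY */
    return 0;
}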
@@ -6336,6 +6378,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
6336 struct mddev *mddev = NULL; 6378 struct mddev *mddev = NULL;
6337 int ro; 6379 int ro;
6338 6380
6381 if (!md_ioctl_valid(cmd))
6382 return -ENOTTY;
6383
6339 switch (cmd) { 6384 switch (cmd) {
6340 case RAID_VERSION: 6385 case RAID_VERSION:
6341 case GET_ARRAY_INFO: 6386 case GET_ARRAY_INFO:
@@ -7706,10 +7751,12 @@ static int remove_and_add_spares(struct mddev *mddev,
7706 if (test_bit(Faulty, &rdev->flags)) 7751 if (test_bit(Faulty, &rdev->flags))
7707 continue; 7752 continue;
7708 if (mddev->ro && 7753 if (mddev->ro &&
7709 rdev->saved_raid_disk < 0) 7754 ! (rdev->saved_raid_disk >= 0 &&
7755 !test_bit(Bitmap_sync, &rdev->flags)))
7710 continue; 7756 continue;
7711 7757
7712 rdev->recovery_offset = 0; 7758 if (rdev->saved_raid_disk < 0)
7759 rdev->recovery_offset = 0;
7713 if (mddev->pers-> 7760 if (mddev->pers->
7714 hot_add_disk(mddev, rdev) == 0) { 7761 hot_add_disk(mddev, rdev) == 0) {
7715 if (sysfs_link_rdev(mddev, rdev)) 7762 if (sysfs_link_rdev(mddev, rdev))
@@ -7787,9 +7834,12 @@ void md_check_recovery(struct mddev *mddev)
7787 * As we only add devices that are already in-sync, 7834 * As we only add devices that are already in-sync,
7788 * we can activate the spares immediately. 7835 * we can activate the spares immediately.
7789 */ 7836 */
7790 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7791 remove_and_add_spares(mddev, NULL); 7837 remove_and_add_spares(mddev, NULL);
7792 mddev->pers->spare_active(mddev); 7838 /* There is no thread, but we need to call
7839 * ->spare_active and clear saved_raid_disk
7840 */
7841 md_reap_sync_thread(mddev);
7842 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7793 goto unlock; 7843 goto unlock;
7794 } 7844 }
7795 7845
@@ -7926,14 +7976,10 @@ void md_reap_sync_thread(struct mddev *mddev)
7926 mddev->pers->finish_reshape(mddev); 7976 mddev->pers->finish_reshape(mddev);
7927 7977
7928 /* If array is no-longer degraded, then any saved_raid_disk 7978 /* If array is no-longer degraded, then any saved_raid_disk
7929 * information must be scrapped. Also if any device is now 7979 * information must be scrapped.
7930 * In_sync we must scrape the saved_raid_disk for that device
7931 * do the superblock for an incrementally recovered device
7932 * written out.
7933 */ 7980 */
7934 rdev_for_each(rdev, mddev) 7981 if (!mddev->degraded)
7935 if (!mddev->degraded || 7982 rdev_for_each(rdev, mddev)
7936 test_bit(In_sync, &rdev->flags))
7937 rdev->saved_raid_disk = -1; 7983 rdev->saved_raid_disk = -1;
7938 7984
7939 md_update_sb(mddev, 1); 7985 md_update_sb(mddev, 1);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2f5cc8a7ef3e..07bba96de260 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -106,7 +106,7 @@ struct md_rdev {
106 */ 106 */
107 struct work_struct del_work; /* used for delayed sysfs removal */ 107 struct work_struct del_work; /* used for delayed sysfs removal */
108 108
109 struct sysfs_dirent *sysfs_state; /* handle for 'state' 109 struct kernfs_node *sysfs_state; /* handle for 'state'
110 * sysfs entry */ 110 * sysfs entry */
111 111
112 struct badblocks { 112 struct badblocks {
@@ -129,6 +129,9 @@ struct md_rdev {
129enum flag_bits { 129enum flag_bits {
130 Faulty, /* device is known to have a fault */ 130 Faulty, /* device is known to have a fault */
131 In_sync, /* device is in_sync with rest of array */ 131 In_sync, /* device is in_sync with rest of array */
132 Bitmap_sync, /* ..actually, not quite In_sync. Need a
133 * bitmap-based recovery to get fully in sync
134 */
132 Unmerged, /* device is being added to array and should 135 Unmerged, /* device is being added to array and should
133 * be considerred for bvec_merge_fn but not 136 * be considerred for bvec_merge_fn but not
134 * yet for actual IO 137 * yet for actual IO
@@ -376,10 +379,10 @@ struct mddev {
376 sector_t resync_max; /* resync should pause 379 sector_t resync_max; /* resync should pause
377 * when it gets here */ 380 * when it gets here */
378 381
379 struct sysfs_dirent *sysfs_state; /* handle for 'array_state' 382 struct kernfs_node *sysfs_state; /* handle for 'array_state'
380 * file in sysfs. 383 * file in sysfs.
381 */ 384 */
382 struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ 385 struct kernfs_node *sysfs_action; /* handle for 'sync_action' */
383 386
384 struct work_struct del_work; /* used for delayed sysfs removal */ 387 struct work_struct del_work; /* used for delayed sysfs removal */
385 388
@@ -498,13 +501,13 @@ struct md_sysfs_entry {
498}; 501};
499extern struct attribute_group md_bitmap_group; 502extern struct attribute_group md_bitmap_group;
500 503
501static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) 504static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name)
502{ 505{
503 if (sd) 506 if (sd)
504 return sysfs_get_dirent(sd, name); 507 return sysfs_get_dirent(sd, name);
505 return sd; 508 return sd;
506} 509}
507static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) 510static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd)
508{ 511{
509 if (sd) 512 if (sd)
510 sysfs_notify_dirent(sd); 513 sysfs_notify_dirent(sd);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 1642eae75a33..849ad39f547b 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -100,7 +100,7 @@ static void multipath_end_request(struct bio *bio, int error)
100 md_error (mp_bh->mddev, rdev); 100 md_error (mp_bh->mddev, rdev);
101 printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", 101 printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
102 bdevname(rdev->bdev,b), 102 bdevname(rdev->bdev,b),
103 (unsigned long long)bio->bi_sector); 103 (unsigned long long)bio->bi_iter.bi_sector);
104 multipath_reschedule_retry(mp_bh); 104 multipath_reschedule_retry(mp_bh);
105 } else 105 } else
106 multipath_end_bh_io(mp_bh, error); 106 multipath_end_bh_io(mp_bh, error);
@@ -132,7 +132,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
132 multipath = conf->multipaths + mp_bh->path; 132 multipath = conf->multipaths + mp_bh->path;
133 133
134 mp_bh->bio = *bio; 134 mp_bh->bio = *bio;
135 mp_bh->bio.bi_sector += multipath->rdev->data_offset; 135 mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
136 mp_bh->bio.bi_bdev = multipath->rdev->bdev; 136 mp_bh->bio.bi_bdev = multipath->rdev->bdev;
137 mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; 137 mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT;
138 mp_bh->bio.bi_end_io = multipath_end_request; 138 mp_bh->bio.bi_end_io = multipath_end_request;
@@ -355,21 +355,22 @@ static void multipathd(struct md_thread *thread)
355 spin_unlock_irqrestore(&conf->device_lock, flags); 355 spin_unlock_irqrestore(&conf->device_lock, flags);
356 356
357 bio = &mp_bh->bio; 357 bio = &mp_bh->bio;
358 bio->bi_sector = mp_bh->master_bio->bi_sector; 358 bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
359 359
360 if ((mp_bh->path = multipath_map (conf))<0) { 360 if ((mp_bh->path = multipath_map (conf))<0) {
361 printk(KERN_ALERT "multipath: %s: unrecoverable IO read" 361 printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
362 " error for block %llu\n", 362 " error for block %llu\n",
363 bdevname(bio->bi_bdev,b), 363 bdevname(bio->bi_bdev,b),
364 (unsigned long long)bio->bi_sector); 364 (unsigned long long)bio->bi_iter.bi_sector);
365 multipath_end_bh_io(mp_bh, -EIO); 365 multipath_end_bh_io(mp_bh, -EIO);
366 } else { 366 } else {
367 printk(KERN_ERR "multipath: %s: redirecting sector %llu" 367 printk(KERN_ERR "multipath: %s: redirecting sector %llu"
368 " to another IO path\n", 368 " to another IO path\n",
369 bdevname(bio->bi_bdev,b), 369 bdevname(bio->bi_bdev,b),
370 (unsigned long long)bio->bi_sector); 370 (unsigned long long)bio->bi_iter.bi_sector);
371 *bio = *(mp_bh->master_bio); 371 *bio = *(mp_bh->master_bio);
372 bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; 372 bio->bi_iter.bi_sector +=
373 conf->multipaths[mp_bh->path].rdev->data_offset;
373 bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; 374 bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
374 bio->bi_rw |= REQ_FAILFAST_TRANSPORT; 375 bio->bi_rw |= REQ_FAILFAST_TRANSPORT;
375 bio->bi_end_io = multipath_end_request; 376 bio->bi_end_io = multipath_end_request;
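
The multipath changes above are part of the immutable-biovec conversion: the advancing cursor of struct bio (bi_sector, bi_size, bi_idx) moved into bio->bi_iter, so every sector remap now goes through that member. A toy model of the field move, using simplified stand-in structs rather than the real kernel definitions:

#include <stdio.h>

typedef unsigned long long sector_t;

struct bvec_iter {
	sector_t     bi_sector;	/* current sector; advances as the bio progresses */
	unsigned int bi_size;	/* residual byte count */
};

struct bio {
	struct bvec_iter bi_iter;	/* was: sector_t bi_sector; unsigned bi_size; */
};

int main(void)
{
	struct bio bio = { .bi_iter = { .bi_sector = 2048, .bi_size = 4096 } };
	sector_t data_offset = 8192;	/* per-rdev data offset, as in multipath_make_request() */

	/* old code: bio.bi_sector += data_offset; */
	bio.bi_iter.bi_sector += data_offset;

	printf("remapped sector: %llu\n", bio.bi_iter.bi_sector);
	return 0;
}
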
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA
6 ---help--- 6 ---help---
7 Library providing immutable on-disk data structure support for 7 Library providing immutable on-disk data structure support for
8 device-mapper targets such as the thin provisioning target. 8 device-mapper targets such as the thin provisioning target.
9
10config DM_DEBUG_BLOCK_STACK_TRACING
11 boolean "Keep stack trace of persistent data block lock holders"
12 depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
13 select STACKTRACE
14 ---help---
15 Enable this for messages that may help debug problems with the
16 block manager locking used by thin provisioning and caching.
17
18 If unsure, say N.
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 064a3c271baa..455f79279a16 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -104,7 +104,7 @@ static int __check_holder(struct block_lock *lock)
104 104
105 for (i = 0; i < MAX_HOLDERS; i++) { 105 for (i = 0; i < MAX_HOLDERS; i++) {
106 if (lock->holders[i] == current) { 106 if (lock->holders[i] == current) {
107 DMERR("recursive lock detected in pool metadata"); 107 DMERR("recursive lock detected in metadata");
108#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING 108#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
109 DMERR("previously held here:"); 109 DMERR("previously held here:");
110 print_stack_trace(lock->traces + i, 4); 110 print_stack_trace(lock->traces + i, 4);
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 468e371ee9b2..416060c25709 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -770,8 +770,8 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
770 770
771/*----------------------------------------------------------------*/ 771/*----------------------------------------------------------------*/
772 772
773static int find_highest_key(struct ro_spine *s, dm_block_t block, 773static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
774 uint64_t *result_key, dm_block_t *next_block) 774 uint64_t *result_key, dm_block_t *next_block)
775{ 775{
776 int i, r; 776 int i, r;
777 uint32_t flags; 777 uint32_t flags;
@@ -788,7 +788,11 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block,
788 else 788 else
789 i--; 789 i--;
790 790
791 *result_key = le64_to_cpu(ro_node(s)->keys[i]); 791 if (find_highest)
792 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
793 else
794 *result_key = le64_to_cpu(ro_node(s)->keys[0]);
795
792 if (next_block || flags & INTERNAL_NODE) 796 if (next_block || flags & INTERNAL_NODE)
793 block = value64(ro_node(s), i); 797 block = value64(ro_node(s), i);
794 798
@@ -799,16 +803,16 @@ static int find_highest_key(struct ro_spine *s, dm_block_t block,
799 return 0; 803 return 0;
800} 804}
801 805
802int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, 806static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
803 uint64_t *result_keys) 807 bool find_highest, uint64_t *result_keys)
804{ 808{
805 int r = 0, count = 0, level; 809 int r = 0, count = 0, level;
806 struct ro_spine spine; 810 struct ro_spine spine;
807 811
808 init_ro_spine(&spine, info); 812 init_ro_spine(&spine, info);
809 for (level = 0; level < info->levels; level++) { 813 for (level = 0; level < info->levels; level++) {
810 r = find_highest_key(&spine, root, result_keys + level, 814 r = find_key(&spine, root, find_highest, result_keys + level,
811 level == info->levels - 1 ? NULL : &root); 815 level == info->levels - 1 ? NULL : &root);
812 if (r == -ENODATA) { 816 if (r == -ENODATA) {
813 r = 0; 817 r = 0;
814 break; 818 break;
@@ -822,8 +826,23 @@ int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
822 826
823 return r ? r : count; 827 return r ? r : count;
824} 828}
829
830int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
831 uint64_t *result_keys)
832{
833 return dm_btree_find_key(info, root, true, result_keys);
834}
825EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); 835EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
826 836
837int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
838 uint64_t *result_keys)
839{
840 return dm_btree_find_key(info, root, false, result_keys);
841}
842EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
843
844/*----------------------------------------------------------------*/
845
827/* 846/*
828 * FIXME: We shouldn't use a recursive algorithm when we have limited stack 847 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
829 * space. Also this only works for single level trees. 848 * space. Also this only works for single level trees.
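
dm_btree_find_highest_key() and the new dm_btree_find_lowest_key() now share a single traversal, parameterised by find_highest. A sketch of how a caller might combine the pair to bound the keys of a single-level tree; this fragment is illustrative only, assumes the declarations from dm-btree.h, and get_key_range() is not a function added by the patch:

static int get_key_range(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *lo, uint64_t *hi)
{
	int r;

	r = dm_btree_find_lowest_key(info, root, lo);
	if (r < 0)
		return r;

	r = dm_btree_find_highest_key(info, root, hi);
	if (r < 0)
		return r;

	/* Both return the number of key entries filled; 0 means an empty tree. */
	return r ? 0 : -ENODATA;
}
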
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index 8672d159e0b5..dacfc34180b4 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -137,6 +137,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
137/* 137/*
138 * Returns < 0 on failure. Otherwise the number of key entries that have 138 * Returns < 0 on failure. Otherwise the number of key entries that have
139 * been filled out. Remember trees can have zero entries, and as such have 139 * been filled out. Remember trees can have zero entries, and as such have
140 * no lowest key.
141 */
142int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
143 uint64_t *result_keys);
144
145/*
146 * Returns < 0 on failure. Otherwise the number of key entries that have
147 * been filled out. Remember trees can have zero entries, and as such have
140 * no highest key. 148 * no highest key.
141 */ 149 */
142int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, 150int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 466a60bbd716..aacbe70c2c2e 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -245,6 +245,10 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
245 return -EINVAL; 245 return -EINVAL;
246 } 246 }
247 247
248 /*
249 * We need to set this before the dm_tm_new_block() call below.
250 */
251 ll->nr_blocks = nr_blocks;
248 for (i = old_blocks; i < blocks; i++) { 252 for (i = old_blocks; i < blocks; i++) {
249 struct dm_block *b; 253 struct dm_block *b;
250 struct disk_index_entry idx; 254 struct disk_index_entry idx;
@@ -252,6 +256,7 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
252 r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); 256 r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
253 if (r < 0) 257 if (r < 0)
254 return r; 258 return r;
259
255 idx.blocknr = cpu_to_le64(dm_block_location(b)); 260 idx.blocknr = cpu_to_le64(dm_block_location(b));
256 261
257 r = dm_tm_unlock(ll->tm, b); 262 r = dm_tm_unlock(ll->tm, b);
@@ -266,7 +271,6 @@ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
266 return r; 271 return r;
267 } 272 }
268 273
269 ll->nr_blocks = nr_blocks;
270 return 0; 274 return 0;
271} 275}
272 276
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 58fc1eef7499..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -91,6 +91,69 @@ struct block_op {
91 dm_block_t block; 91 dm_block_t block;
92}; 92};
93 93
94struct bop_ring_buffer {
95 unsigned begin;
96 unsigned end;
97 struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
98};
99
100static void brb_init(struct bop_ring_buffer *brb)
101{
102 brb->begin = 0;
103 brb->end = 0;
104}
105
106static bool brb_empty(struct bop_ring_buffer *brb)
107{
108 return brb->begin == brb->end;
109}
110
111static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
112{
113 unsigned r = old + 1;
114 return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
115}
116
117static int brb_push(struct bop_ring_buffer *brb,
118 enum block_op_type type, dm_block_t b)
119{
120 struct block_op *bop;
121 unsigned next = brb_next(brb, brb->end);
122
123 /*
124 * We don't allow the last bop to be filled, this way we can
125 * differentiate between full and empty.
126 */
127 if (next == brb->begin)
128 return -ENOMEM;
129
130 bop = brb->bops + brb->end;
131 bop->type = type;
132 bop->block = b;
133
134 brb->end = next;
135
136 return 0;
137}
138
139static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
140{
141 struct block_op *bop;
142
143 if (brb_empty(brb))
144 return -ENODATA;
145
146 bop = brb->bops + brb->begin;
147 result->type = bop->type;
148 result->block = bop->block;
149
150 brb->begin = brb_next(brb, brb->begin);
151
152 return 0;
153}
154
155/*----------------------------------------------------------------*/
156
94struct sm_metadata { 157struct sm_metadata {
95 struct dm_space_map sm; 158 struct dm_space_map sm;
96 159
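
bop_ring_buffer keeps one slot permanently unused so that begin == end can mean "empty" while next(end) == begin means "full". A self-contained toy (plain userspace C, not kernel code) with the capacity shrunk to three entries makes the wrap-around easy to check:

#include <stdio.h>

#define TOY_CAPACITY 3	/* stands in for MAX_RECURSIVE_ALLOCATIONS */

struct ring {
	unsigned begin, end;
	int slots[TOY_CAPACITY + 1];	/* one extra, never-filled slot */
};

static unsigned ring_next(unsigned i)
{
	return (i + 1) % (TOY_CAPACITY + 1);
}

static int ring_push(struct ring *r, int v)
{
	if (ring_next(r->end) == r->begin)
		return -1;		/* full */
	r->slots[r->end] = v;
	r->end = ring_next(r->end);
	return 0;
}

static int ring_pop(struct ring *r, int *v)
{
	if (r->begin == r->end)
		return -1;		/* empty */
	*v = r->slots[r->begin];
	r->begin = ring_next(r->begin);
	return 0;
}

int main(void)
{
	struct ring r = { 0, 0, { 0 } };
	int i, v;

	for (i = 0; i < 5; i++)	/* only the first TOY_CAPACITY pushes succeed */
		printf("push %d -> %s\n", i, ring_push(&r, i) ? "full" : "ok");
	while (!ring_pop(&r, &v))
		printf("pop  %d\n", v);
	return 0;
}
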
@@ -101,25 +164,20 @@ struct sm_metadata {
101 164
102 unsigned recursion_count; 165 unsigned recursion_count;
103 unsigned allocated_this_transaction; 166 unsigned allocated_this_transaction;
104 unsigned nr_uncommitted; 167 struct bop_ring_buffer uncommitted;
105 struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
106 168
107 struct threshold threshold; 169 struct threshold threshold;
108}; 170};
109 171
110static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) 172static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
111{ 173{
112 struct block_op *op; 174 int r = brb_push(&smm->uncommitted, type, b);
113 175
114 if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { 176 if (r) {
115 DMERR("too many recursive allocations"); 177 DMERR("too many recursive allocations");
116 return -ENOMEM; 178 return -ENOMEM;
117 } 179 }
118 180
119 op = smm->uncommitted + smm->nr_uncommitted++;
120 op->type = type;
121 op->block = b;
122
123 return 0; 181 return 0;
124} 182}
125 183
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
158 return -ENOMEM; 216 return -ENOMEM;
159 } 217 }
160 218
161 if (smm->recursion_count == 1 && smm->nr_uncommitted) { 219 if (smm->recursion_count == 1) {
162 while (smm->nr_uncommitted && !r) { 220 while (!brb_empty(&smm->uncommitted)) {
163 smm->nr_uncommitted--; 221 struct block_op bop;
164 r = commit_bop(smm, smm->uncommitted + 222
165 smm->nr_uncommitted); 223 r = brb_pop(&smm->uncommitted, &bop);
224 if (r) {
225 DMERR("bug in bop ring buffer");
226 break;
227 }
228
229 r = commit_bop(smm, &bop);
166 if (r) 230 if (r)
167 break; 231 break;
168 } 232 }
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
217static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, 281static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
218 uint32_t *result) 282 uint32_t *result)
219{ 283{
220 int r, i; 284 int r;
285 unsigned i;
221 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 286 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
222 unsigned adjustment = 0; 287 unsigned adjustment = 0;
223 288
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
225 * We may have some uncommitted adjustments to add. This list 290 * We may have some uncommitted adjustments to add. This list
226 * should always be really short. 291 * should always be really short.
227 */ 292 */
228 for (i = 0; i < smm->nr_uncommitted; i++) { 293 for (i = smm->uncommitted.begin;
229 struct block_op *op = smm->uncommitted + i; 294 i != smm->uncommitted.end;
295 i = brb_next(&smm->uncommitted, i)) {
296 struct block_op *op = smm->uncommitted.bops + i;
230 297
231 if (op->block != b) 298 if (op->block != b)
232 continue; 299 continue;
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
254static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, 321static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
255 dm_block_t b, int *result) 322 dm_block_t b, int *result)
256{ 323{
257 int r, i, adjustment = 0; 324 int r, adjustment = 0;
325 unsigned i;
258 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); 326 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
259 uint32_t rc; 327 uint32_t rc;
260 328
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
262 * We may have some uncommitted adjustments to add. This list 330 * We may have some uncommitted adjustments to add. This list
263 * should always be really short. 331 * should always be really short.
264 */ 332 */
265 for (i = 0; i < smm->nr_uncommitted; i++) { 333 for (i = smm->uncommitted.begin;
266 struct block_op *op = smm->uncommitted + i; 334 i != smm->uncommitted.end;
335 i = brb_next(&smm->uncommitted, i)) {
336
337 struct block_op *op = smm->uncommitted.bops + i;
267 338
268 if (op->block != b) 339 if (op->block != b)
269 continue; 340 continue;
@@ -385,13 +456,13 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
385 456
386 int r = sm_metadata_new_block_(sm, b); 457 int r = sm_metadata_new_block_(sm, b);
387 if (r) { 458 if (r) {
388 DMERR("unable to allocate new metadata block"); 459 DMERR_LIMIT("unable to allocate new metadata block");
389 return r; 460 return r;
390 } 461 }
391 462
392 r = sm_metadata_get_nr_free(sm, &count); 463 r = sm_metadata_get_nr_free(sm, &count);
393 if (r) { 464 if (r) {
394 DMERR("couldn't get free block count"); 465 DMERR_LIMIT("couldn't get free block count");
395 return r; 466 return r;
396 } 467 }
397 468
@@ -608,20 +679,38 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
608 * Flick into a mode where all blocks get allocated in the new area. 679 * Flick into a mode where all blocks get allocated in the new area.
609 */ 680 */
610 smm->begin = old_len; 681 smm->begin = old_len;
611 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); 682 memcpy(sm, &bootstrap_ops, sizeof(*sm));
612 683
613 /* 684 /*
614 * Extend. 685 * Extend.
615 */ 686 */
616 r = sm_ll_extend(&smm->ll, extra_blocks); 687 r = sm_ll_extend(&smm->ll, extra_blocks);
688 if (r)
689 goto out;
617 690
618 /* 691 /*
619 * Switch back to normal behaviour. 692 * We repeatedly increment then commit until the commit doesn't
693 * allocate any new blocks.
620 */ 694 */
621 memcpy(&smm->sm, &ops, sizeof(smm->sm)); 695 do {
622 for (i = old_len; !r && i < smm->begin; i++) 696 for (i = old_len; !r && i < smm->begin; i++) {
623 r = sm_ll_inc(&smm->ll, i, &ev); 697 r = sm_ll_inc(&smm->ll, i, &ev);
698 if (r)
699 goto out;
700 }
701 old_len = smm->begin;
702
703 r = sm_ll_commit(&smm->ll);
704 if (r)
705 goto out;
706
707 } while (old_len != smm->begin);
624 708
709out:
710 /*
711 * Switch back to normal behaviour.
712 */
713 memcpy(sm, &ops, sizeof(*sm));
625 return r; 714 return r;
626} 715}
627 716
@@ -653,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
653 smm->begin = superblock + 1; 742 smm->begin = superblock + 1;
654 smm->recursion_count = 0; 743 smm->recursion_count = 0;
655 smm->allocated_this_transaction = 0; 744 smm->allocated_this_transaction = 0;
656 smm->nr_uncommitted = 0; 745 brb_init(&smm->uncommitted);
657 threshold_init(&smm->threshold); 746 threshold_init(&smm->threshold);
658 747
659 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); 748 memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -662,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
662 if (r) 751 if (r)
663 return r; 752 return r;
664 753
754 if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
755 nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
665 r = sm_ll_extend(&smm->ll, nr_blocks); 756 r = sm_ll_extend(&smm->ll, nr_blocks);
666 if (r) 757 if (r)
667 return r; 758 return r;
@@ -695,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
695 smm->begin = 0; 786 smm->begin = 0;
696 smm->recursion_count = 0; 787 smm->recursion_count = 0;
697 smm->allocated_this_transaction = 0; 788 smm->allocated_this_transaction = 0;
698 smm->nr_uncommitted = 0; 789 brb_init(&smm->uncommitted);
699 threshold_init(&smm->threshold); 790 threshold_init(&smm->threshold);
700 791
701 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); 792 memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
index 39bba0801cf2..64df923974d8 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.h
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -9,6 +9,17 @@
9 9
10#include "dm-transaction-manager.h" 10#include "dm-transaction-manager.h"
11 11
12#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)
13
14/*
15 * The metadata device is currently limited in size.
16 *
17 * We have one block of index, which can hold 255 index entries. Each
18 * index entry contains allocation info about ~16k metadata blocks.
19 */
20#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))
21#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)
22
12/* 23/*
13 * Unfortunately we have to use two-phase construction due to the cycle 24 * Unfortunately we have to use two-phase construction due to the cycle
14 * between the tm and sm. 25 * between the tm and sm.
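
With 512-byte sectors, DM_SM_METADATA_BLOCK_SIZE works out to 8 sectors per 4KiB block, so the new cap is 255 * (2^14 - 64) = 4,161,600 metadata blocks, roughly 15.9 GiB of metadata. A quick standalone check of the arithmetic (not kernel code):

#include <stdio.h>

int main(void)
{
	const unsigned long long block_size = 4096;			/* bytes */
	const unsigned long long block_sectors = block_size >> 9;	/* = 8 */
	const unsigned long long max_blocks = 255ULL * ((1 << 14) - 64);
	const unsigned long long max_sectors = max_blocks * block_sectors;

	printf("max metadata blocks : %llu\n", max_blocks);	/* 4161600 */
	printf("max metadata sectors: %llu\n", max_sectors);	/* 33292800 */
	printf("max metadata size   : ~%.1f GiB\n",
	       (double)max_blocks * block_size / (1ULL << 30));	/* ~15.9 */
	return 0;
}
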
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c4d420b7d2f4..407a99e46f69 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -501,10 +501,11 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
501 unsigned int chunk_sects, struct bio *bio) 501 unsigned int chunk_sects, struct bio *bio)
502{ 502{
503 if (likely(is_power_of_2(chunk_sects))) { 503 if (likely(is_power_of_2(chunk_sects))) {
504 return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) 504 return chunk_sects >=
505 ((bio->bi_iter.bi_sector & (chunk_sects-1))
505 + bio_sectors(bio)); 506 + bio_sectors(bio));
506 } else{ 507 } else{
507 sector_t sector = bio->bi_sector; 508 sector_t sector = bio->bi_iter.bi_sector;
508 return chunk_sects >= (sector_div(sector, chunk_sects) 509 return chunk_sects >= (sector_div(sector, chunk_sects)
509 + bio_sectors(bio)); 510 + bio_sectors(bio));
510 } 511 }
@@ -512,64 +513,44 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
512 513
513static void raid0_make_request(struct mddev *mddev, struct bio *bio) 514static void raid0_make_request(struct mddev *mddev, struct bio *bio)
514{ 515{
515 unsigned int chunk_sects;
516 sector_t sector_offset;
517 struct strip_zone *zone; 516 struct strip_zone *zone;
518 struct md_rdev *tmp_dev; 517 struct md_rdev *tmp_dev;
518 struct bio *split;
519 519
520 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 520 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
521 md_flush_request(mddev, bio); 521 md_flush_request(mddev, bio);
522 return; 522 return;
523 } 523 }
524 524
525 chunk_sects = mddev->chunk_sectors; 525 do {
526 if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { 526 sector_t sector = bio->bi_iter.bi_sector;
527 sector_t sector = bio->bi_sector; 527 unsigned chunk_sects = mddev->chunk_sectors;
528 struct bio_pair *bp;
529 /* Sanity check -- queue functions should prevent this happening */
530 if (bio_segments(bio) > 1)
531 goto bad_map;
532 /* This is a one page bio that upper layers
533 * refuse to split for us, so we need to split it.
534 */
535 if (likely(is_power_of_2(chunk_sects)))
536 bp = bio_split(bio, chunk_sects - (sector &
537 (chunk_sects-1)));
538 else
539 bp = bio_split(bio, chunk_sects -
540 sector_div(sector, chunk_sects));
541 raid0_make_request(mddev, &bp->bio1);
542 raid0_make_request(mddev, &bp->bio2);
543 bio_pair_release(bp);
544 return;
545 }
546 528
547 sector_offset = bio->bi_sector; 529 unsigned sectors = chunk_sects -
548 zone = find_zone(mddev->private, &sector_offset); 530 (likely(is_power_of_2(chunk_sects))
549 tmp_dev = map_sector(mddev, zone, bio->bi_sector, 531 ? (sector & (chunk_sects-1))
550 &sector_offset); 532 : sector_div(sector, chunk_sects));
551 bio->bi_bdev = tmp_dev->bdev;
552 bio->bi_sector = sector_offset + zone->dev_start +
553 tmp_dev->data_offset;
554
555 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
556 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
557 /* Just ignore it */
558 bio_endio(bio, 0);
559 return;
560 }
561 533
562 generic_make_request(bio); 534 if (sectors < bio_sectors(bio)) {
563 return; 535 split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
564 536 bio_chain(split, bio);
565bad_map: 537 } else {
566 printk("md/raid0:%s: make_request bug: can't convert block across chunks" 538 split = bio;
567 " or bigger than %dk %llu %d\n", 539 }
568 mdname(mddev), chunk_sects / 2,
569 (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
570 540
571 bio_io_error(bio); 541 zone = find_zone(mddev->private, &sector);
572 return; 542 tmp_dev = map_sector(mddev, zone, sector, &sector);
543 split->bi_bdev = tmp_dev->bdev;
544 split->bi_iter.bi_sector = sector + zone->dev_start +
545 tmp_dev->data_offset;
546
547 if (unlikely((split->bi_rw & REQ_DISCARD) &&
548 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
549 /* Just ignore it */
550 bio_endio(split, 0);
551 } else
552 generic_make_request(split);
553 } while (split != bio);
573} 554}
574 555
575static void raid0_status(struct seq_file *seq, struct mddev *mddev) 556static void raid0_status(struct seq_file *seq, struct mddev *mddev)
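
Instead of the old one-page bio_pair path, raid0_make_request() now walks the bio itself, peeling off at most one chunk's worth per pass with bio_split()/bio_chain() and submitting each piece. The sector arithmetic of that loop, pulled out into a standalone example with made-up numbers (128-sector chunks, an arbitrary starting sector):

#include <stdio.h>

int main(void)
{
	unsigned long long sector = 1000;	/* bio->bi_iter.bi_sector */
	unsigned remaining = 300;		/* bio_sectors(bio) */
	const unsigned chunk_sects = 128;	/* power-of-two chunk size, in sectors */

	while (remaining) {
		/* sectors left before the next chunk boundary */
		unsigned sectors = chunk_sects - (sector & (chunk_sects - 1));

		if (sectors > remaining)
			sectors = remaining;	/* final piece: no split needed */

		printf("submit %3u sectors at %llu\n", sectors, sector);
		sector += sectors;
		remaining -= sectors;
	}
	return 0;
}
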
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1e5a540995e9..4a6ca1cb2e78 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -229,7 +229,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
229 int done; 229 int done;
230 struct r1conf *conf = r1_bio->mddev->private; 230 struct r1conf *conf = r1_bio->mddev->private;
231 sector_t start_next_window = r1_bio->start_next_window; 231 sector_t start_next_window = r1_bio->start_next_window;
232 sector_t bi_sector = bio->bi_sector; 232 sector_t bi_sector = bio->bi_iter.bi_sector;
233 233
234 if (bio->bi_phys_segments) { 234 if (bio->bi_phys_segments) {
235 unsigned long flags; 235 unsigned long flags;
@@ -265,9 +265,8 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
265 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 265 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
266 pr_debug("raid1: sync end %s on sectors %llu-%llu\n", 266 pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
267 (bio_data_dir(bio) == WRITE) ? "write" : "read", 267 (bio_data_dir(bio) == WRITE) ? "write" : "read",
268 (unsigned long long) bio->bi_sector, 268 (unsigned long long) bio->bi_iter.bi_sector,
269 (unsigned long long) bio->bi_sector + 269 (unsigned long long) bio_end_sector(bio) - 1);
270 bio_sectors(bio) - 1);
271 270
272 call_bio_endio(r1_bio); 271 call_bio_endio(r1_bio);
273 } 272 }
@@ -466,9 +465,8 @@ static void raid1_end_write_request(struct bio *bio, int error)
466 struct bio *mbio = r1_bio->master_bio; 465 struct bio *mbio = r1_bio->master_bio;
467 pr_debug("raid1: behind end write sectors" 466 pr_debug("raid1: behind end write sectors"
468 " %llu-%llu\n", 467 " %llu-%llu\n",
469 (unsigned long long) mbio->bi_sector, 468 (unsigned long long) mbio->bi_iter.bi_sector,
470 (unsigned long long) mbio->bi_sector + 469 (unsigned long long) bio_end_sector(mbio) - 1);
471 bio_sectors(mbio) - 1);
472 call_bio_endio(r1_bio); 470 call_bio_endio(r1_bio);
473 } 471 }
474 } 472 }
@@ -875,7 +873,7 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
875 else if ((conf->next_resync - RESYNC_WINDOW_SECTORS 873 else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
876 >= bio_end_sector(bio)) || 874 >= bio_end_sector(bio)) ||
877 (conf->next_resync + NEXT_NORMALIO_DISTANCE 875 (conf->next_resync + NEXT_NORMALIO_DISTANCE
878 <= bio->bi_sector)) 876 <= bio->bi_iter.bi_sector))
879 wait = false; 877 wait = false;
880 else 878 else
881 wait = true; 879 wait = true;
@@ -913,20 +911,19 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
913 911
914 if (bio && bio_data_dir(bio) == WRITE) { 912 if (bio && bio_data_dir(bio) == WRITE) {
915 if (conf->next_resync + NEXT_NORMALIO_DISTANCE 913 if (conf->next_resync + NEXT_NORMALIO_DISTANCE
916 <= bio->bi_sector) { 914 <= bio->bi_iter.bi_sector) {
917 if (conf->start_next_window == MaxSector) 915 if (conf->start_next_window == MaxSector)
918 conf->start_next_window = 916 conf->start_next_window =
919 conf->next_resync + 917 conf->next_resync +
920 NEXT_NORMALIO_DISTANCE; 918 NEXT_NORMALIO_DISTANCE;
921 919
922 if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) 920 if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
923 <= bio->bi_sector) 921 <= bio->bi_iter.bi_sector)
924 conf->next_window_requests++; 922 conf->next_window_requests++;
925 else 923 else
926 conf->current_window_requests++; 924 conf->current_window_requests++;
927 }
928 if (bio->bi_sector >= conf->start_next_window)
929 sector = conf->start_next_window; 925 sector = conf->start_next_window;
926 }
930 } 927 }
931 928
932 conf->nr_pending++; 929 conf->nr_pending++;
@@ -1028,7 +1025,8 @@ do_sync_io:
1028 if (bvecs[i].bv_page) 1025 if (bvecs[i].bv_page)
1029 put_page(bvecs[i].bv_page); 1026 put_page(bvecs[i].bv_page);
1030 kfree(bvecs); 1027 kfree(bvecs);
1031 pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 1028 pr_debug("%dB behind alloc failed, doing sync I/O\n",
1029 bio->bi_iter.bi_size);
1032} 1030}
1033 1031
1034struct raid1_plug_cb { 1032struct raid1_plug_cb {
@@ -1108,7 +1106,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1108 1106
1109 if (bio_data_dir(bio) == WRITE && 1107 if (bio_data_dir(bio) == WRITE &&
1110 bio_end_sector(bio) > mddev->suspend_lo && 1108 bio_end_sector(bio) > mddev->suspend_lo &&
1111 bio->bi_sector < mddev->suspend_hi) { 1109 bio->bi_iter.bi_sector < mddev->suspend_hi) {
1112 /* As the suspend_* range is controlled by 1110 /* As the suspend_* range is controlled by
1113 * userspace, we want an interruptible 1111 * userspace, we want an interruptible
1114 * wait. 1112 * wait.
@@ -1119,7 +1117,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1119 prepare_to_wait(&conf->wait_barrier, 1117 prepare_to_wait(&conf->wait_barrier,
1120 &w, TASK_INTERRUPTIBLE); 1118 &w, TASK_INTERRUPTIBLE);
1121 if (bio_end_sector(bio) <= mddev->suspend_lo || 1119 if (bio_end_sector(bio) <= mddev->suspend_lo ||
1122 bio->bi_sector >= mddev->suspend_hi) 1120 bio->bi_iter.bi_sector >= mddev->suspend_hi)
1123 break; 1121 break;
1124 schedule(); 1122 schedule();
1125 } 1123 }
@@ -1141,7 +1139,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1141 r1_bio->sectors = bio_sectors(bio); 1139 r1_bio->sectors = bio_sectors(bio);
1142 r1_bio->state = 0; 1140 r1_bio->state = 0;
1143 r1_bio->mddev = mddev; 1141 r1_bio->mddev = mddev;
1144 r1_bio->sector = bio->bi_sector; 1142 r1_bio->sector = bio->bi_iter.bi_sector;
1145 1143
1146 /* We might need to issue multiple reads to different 1144 /* We might need to issue multiple reads to different
1147 * devices if there are bad blocks around, so we keep 1145 * devices if there are bad blocks around, so we keep
@@ -1181,12 +1179,13 @@ read_again:
1181 r1_bio->read_disk = rdisk; 1179 r1_bio->read_disk = rdisk;
1182 1180
1183 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1181 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1184 bio_trim(read_bio, r1_bio->sector - bio->bi_sector, 1182 bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
1185 max_sectors); 1183 max_sectors);
1186 1184
1187 r1_bio->bios[rdisk] = read_bio; 1185 r1_bio->bios[rdisk] = read_bio;
1188 1186
1189 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; 1187 read_bio->bi_iter.bi_sector = r1_bio->sector +
1188 mirror->rdev->data_offset;
1190 read_bio->bi_bdev = mirror->rdev->bdev; 1189 read_bio->bi_bdev = mirror->rdev->bdev;
1191 read_bio->bi_end_io = raid1_end_read_request; 1190 read_bio->bi_end_io = raid1_end_read_request;
1192 read_bio->bi_rw = READ | do_sync; 1191 read_bio->bi_rw = READ | do_sync;
@@ -1198,7 +1197,7 @@ read_again:
1198 */ 1197 */
1199 1198
1200 sectors_handled = (r1_bio->sector + max_sectors 1199 sectors_handled = (r1_bio->sector + max_sectors
1201 - bio->bi_sector); 1200 - bio->bi_iter.bi_sector);
1202 r1_bio->sectors = max_sectors; 1201 r1_bio->sectors = max_sectors;
1203 spin_lock_irq(&conf->device_lock); 1202 spin_lock_irq(&conf->device_lock);
1204 if (bio->bi_phys_segments == 0) 1203 if (bio->bi_phys_segments == 0)
@@ -1219,7 +1218,8 @@ read_again:
1219 r1_bio->sectors = bio_sectors(bio) - sectors_handled; 1218 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1220 r1_bio->state = 0; 1219 r1_bio->state = 0;
1221 r1_bio->mddev = mddev; 1220 r1_bio->mddev = mddev;
1222 r1_bio->sector = bio->bi_sector + sectors_handled; 1221 r1_bio->sector = bio->bi_iter.bi_sector +
1222 sectors_handled;
1223 goto read_again; 1223 goto read_again;
1224 } else 1224 } else
1225 generic_make_request(read_bio); 1225 generic_make_request(read_bio);
@@ -1322,7 +1322,7 @@ read_again:
1322 if (r1_bio->bios[j]) 1322 if (r1_bio->bios[j])
1323 rdev_dec_pending(conf->mirrors[j].rdev, mddev); 1323 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1324 r1_bio->state = 0; 1324 r1_bio->state = 0;
1325 allow_barrier(conf, start_next_window, bio->bi_sector); 1325 allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector);
1326 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1326 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1327 start_next_window = wait_barrier(conf, bio); 1327 start_next_window = wait_barrier(conf, bio);
1328 /* 1328 /*
@@ -1349,7 +1349,7 @@ read_again:
1349 bio->bi_phys_segments++; 1349 bio->bi_phys_segments++;
1350 spin_unlock_irq(&conf->device_lock); 1350 spin_unlock_irq(&conf->device_lock);
1351 } 1351 }
1352 sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; 1352 sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
1353 1353
1354 atomic_set(&r1_bio->remaining, 1); 1354 atomic_set(&r1_bio->remaining, 1);
1355 atomic_set(&r1_bio->behind_remaining, 0); 1355 atomic_set(&r1_bio->behind_remaining, 0);
@@ -1361,7 +1361,7 @@ read_again:
1361 continue; 1361 continue;
1362 1362
1363 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1363 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1364 bio_trim(mbio, r1_bio->sector - bio->bi_sector, max_sectors); 1364 bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
1365 1365
1366 if (first_clone) { 1366 if (first_clone) {
1367 /* do behind I/O ? 1367 /* do behind I/O ?
@@ -1395,7 +1395,7 @@ read_again:
1395 1395
1396 r1_bio->bios[i] = mbio; 1396 r1_bio->bios[i] = mbio;
1397 1397
1398 mbio->bi_sector = (r1_bio->sector + 1398 mbio->bi_iter.bi_sector = (r1_bio->sector +
1399 conf->mirrors[i].rdev->data_offset); 1399 conf->mirrors[i].rdev->data_offset);
1400 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1400 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1401 mbio->bi_end_io = raid1_end_write_request; 1401 mbio->bi_end_io = raid1_end_write_request;
@@ -1435,7 +1435,7 @@ read_again:
1435 r1_bio->sectors = bio_sectors(bio) - sectors_handled; 1435 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1436 r1_bio->state = 0; 1436 r1_bio->state = 0;
1437 r1_bio->mddev = mddev; 1437 r1_bio->mddev = mddev;
1438 r1_bio->sector = bio->bi_sector + sectors_handled; 1438 r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1439 goto retry_write; 1439 goto retry_write;
1440 } 1440 }
1441 1441
@@ -1953,20 +1953,24 @@ static int process_checks(struct r1bio *r1_bio)
1953 for (i = 0; i < conf->raid_disks * 2; i++) { 1953 for (i = 0; i < conf->raid_disks * 2; i++) {
1954 int j; 1954 int j;
1955 int size; 1955 int size;
1956 int uptodate;
1956 struct bio *b = r1_bio->bios[i]; 1957 struct bio *b = r1_bio->bios[i];
1957 if (b->bi_end_io != end_sync_read) 1958 if (b->bi_end_io != end_sync_read)
1958 continue; 1959 continue;
1959 /* fixup the bio for reuse */ 1960 /* fixup the bio for reuse, but preserve BIO_UPTODATE */
1961 uptodate = test_bit(BIO_UPTODATE, &b->bi_flags);
1960 bio_reset(b); 1962 bio_reset(b);
1963 if (!uptodate)
1964 clear_bit(BIO_UPTODATE, &b->bi_flags);
1961 b->bi_vcnt = vcnt; 1965 b->bi_vcnt = vcnt;
1962 b->bi_size = r1_bio->sectors << 9; 1966 b->bi_iter.bi_size = r1_bio->sectors << 9;
1963 b->bi_sector = r1_bio->sector + 1967 b->bi_iter.bi_sector = r1_bio->sector +
1964 conf->mirrors[i].rdev->data_offset; 1968 conf->mirrors[i].rdev->data_offset;
1965 b->bi_bdev = conf->mirrors[i].rdev->bdev; 1969 b->bi_bdev = conf->mirrors[i].rdev->bdev;
1966 b->bi_end_io = end_sync_read; 1970 b->bi_end_io = end_sync_read;
1967 b->bi_private = r1_bio; 1971 b->bi_private = r1_bio;
1968 1972
1969 size = b->bi_size; 1973 size = b->bi_iter.bi_size;
1970 for (j = 0; j < vcnt ; j++) { 1974 for (j = 0; j < vcnt ; j++) {
1971 struct bio_vec *bi; 1975 struct bio_vec *bi;
1972 bi = &b->bi_io_vec[j]; 1976 bi = &b->bi_io_vec[j];
@@ -1990,11 +1994,14 @@ static int process_checks(struct r1bio *r1_bio)
1990 int j; 1994 int j;
1991 struct bio *pbio = r1_bio->bios[primary]; 1995 struct bio *pbio = r1_bio->bios[primary];
1992 struct bio *sbio = r1_bio->bios[i]; 1996 struct bio *sbio = r1_bio->bios[i];
1997 int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags);
1993 1998
1994 if (sbio->bi_end_io != end_sync_read) 1999 if (sbio->bi_end_io != end_sync_read)
1995 continue; 2000 continue;
2001 /* Now we can 'fixup' the BIO_UPTODATE flag */
2002 set_bit(BIO_UPTODATE, &sbio->bi_flags);
1996 2003
1997 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { 2004 if (uptodate) {
1998 for (j = vcnt; j-- ; ) { 2005 for (j = vcnt; j-- ; ) {
1999 struct page *p, *s; 2006 struct page *p, *s;
2000 p = pbio->bi_io_vec[j].bv_page; 2007 p = pbio->bi_io_vec[j].bv_page;
@@ -2009,7 +2016,7 @@ static int process_checks(struct r1bio *r1_bio)
2009 if (j >= 0) 2016 if (j >= 0)
2010 atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); 2017 atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
2011 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) 2018 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
2012 && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { 2019 && uptodate)) {
2013 /* No need to write to this device. */ 2020 /* No need to write to this device. */
2014 sbio->bi_end_io = NULL; 2021 sbio->bi_end_io = NULL;
2015 rdev_dec_pending(conf->mirrors[i].rdev, mddev); 2022 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
@@ -2221,11 +2228,11 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
2221 } 2228 }
2222 2229
2223 wbio->bi_rw = WRITE; 2230 wbio->bi_rw = WRITE;
2224 wbio->bi_sector = r1_bio->sector; 2231 wbio->bi_iter.bi_sector = r1_bio->sector;
2225 wbio->bi_size = r1_bio->sectors << 9; 2232 wbio->bi_iter.bi_size = r1_bio->sectors << 9;
2226 2233
2227 bio_trim(wbio, sector - r1_bio->sector, sectors); 2234 bio_trim(wbio, sector - r1_bio->sector, sectors);
2228 wbio->bi_sector += rdev->data_offset; 2235 wbio->bi_iter.bi_sector += rdev->data_offset;
2229 wbio->bi_bdev = rdev->bdev; 2236 wbio->bi_bdev = rdev->bdev;
2230 if (submit_bio_wait(WRITE, wbio) == 0) 2237 if (submit_bio_wait(WRITE, wbio) == 0)
2231 /* failure! */ 2238 /* failure! */
@@ -2339,7 +2346,8 @@ read_more:
2339 } 2346 }
2340 r1_bio->read_disk = disk; 2347 r1_bio->read_disk = disk;
2341 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); 2348 bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
2342 bio_trim(bio, r1_bio->sector - bio->bi_sector, max_sectors); 2349 bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
2350 max_sectors);
2343 r1_bio->bios[r1_bio->read_disk] = bio; 2351 r1_bio->bios[r1_bio->read_disk] = bio;
2344 rdev = conf->mirrors[disk].rdev; 2352 rdev = conf->mirrors[disk].rdev;
2345 printk_ratelimited(KERN_ERR 2353 printk_ratelimited(KERN_ERR
@@ -2348,7 +2356,7 @@ read_more:
2348 mdname(mddev), 2356 mdname(mddev),
2349 (unsigned long long)r1_bio->sector, 2357 (unsigned long long)r1_bio->sector,
2350 bdevname(rdev->bdev, b)); 2358 bdevname(rdev->bdev, b));
2351 bio->bi_sector = r1_bio->sector + rdev->data_offset; 2359 bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
2352 bio->bi_bdev = rdev->bdev; 2360 bio->bi_bdev = rdev->bdev;
2353 bio->bi_end_io = raid1_end_read_request; 2361 bio->bi_end_io = raid1_end_read_request;
2354 bio->bi_rw = READ | do_sync; 2362 bio->bi_rw = READ | do_sync;
@@ -2357,7 +2365,7 @@ read_more:
2357 /* Drat - have to split this up more */ 2365 /* Drat - have to split this up more */
2358 struct bio *mbio = r1_bio->master_bio; 2366 struct bio *mbio = r1_bio->master_bio;
2359 int sectors_handled = (r1_bio->sector + max_sectors 2367 int sectors_handled = (r1_bio->sector + max_sectors
2360 - mbio->bi_sector); 2368 - mbio->bi_iter.bi_sector);
2361 r1_bio->sectors = max_sectors; 2369 r1_bio->sectors = max_sectors;
2362 spin_lock_irq(&conf->device_lock); 2370 spin_lock_irq(&conf->device_lock);
2363 if (mbio->bi_phys_segments == 0) 2371 if (mbio->bi_phys_segments == 0)
@@ -2375,7 +2383,8 @@ read_more:
2375 r1_bio->state = 0; 2383 r1_bio->state = 0;
2376 set_bit(R1BIO_ReadError, &r1_bio->state); 2384 set_bit(R1BIO_ReadError, &r1_bio->state);
2377 r1_bio->mddev = mddev; 2385 r1_bio->mddev = mddev;
2378 r1_bio->sector = mbio->bi_sector + sectors_handled; 2386 r1_bio->sector = mbio->bi_iter.bi_sector +
2387 sectors_handled;
2379 2388
2380 goto read_more; 2389 goto read_more;
2381 } else 2390 } else
@@ -2599,7 +2608,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2599 } 2608 }
2600 if (bio->bi_end_io) { 2609 if (bio->bi_end_io) {
2601 atomic_inc(&rdev->nr_pending); 2610 atomic_inc(&rdev->nr_pending);
2602 bio->bi_sector = sector_nr + rdev->data_offset; 2611 bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
2603 bio->bi_bdev = rdev->bdev; 2612 bio->bi_bdev = rdev->bdev;
2604 bio->bi_private = r1_bio; 2613 bio->bi_private = r1_bio;
2605 } 2614 }
@@ -2699,7 +2708,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2699 continue; 2708 continue;
2700 /* remove last page from this bio */ 2709 /* remove last page from this bio */
2701 bio->bi_vcnt--; 2710 bio->bi_vcnt--;
2702 bio->bi_size -= len; 2711 bio->bi_iter.bi_size -= len;
2703 bio->bi_flags &= ~(1<< BIO_SEG_VALID); 2712 bio->bi_flags &= ~(1<< BIO_SEG_VALID);
2704 } 2713 }
2705 goto bio_full; 2714 goto bio_full;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c504e8389e69..33fc408e5eac 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1152,14 +1152,12 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1152 kfree(plug); 1152 kfree(plug);
1153} 1153}
1154 1154
1155static void make_request(struct mddev *mddev, struct bio * bio) 1155static void __make_request(struct mddev *mddev, struct bio *bio)
1156{ 1156{
1157 struct r10conf *conf = mddev->private; 1157 struct r10conf *conf = mddev->private;
1158 struct r10bio *r10_bio; 1158 struct r10bio *r10_bio;
1159 struct bio *read_bio; 1159 struct bio *read_bio;
1160 int i; 1160 int i;
1161 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1162 int chunk_sects = chunk_mask + 1;
1163 const int rw = bio_data_dir(bio); 1161 const int rw = bio_data_dir(bio);
1164 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1162 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1165 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1163 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
@@ -1174,88 +1172,27 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1174 int max_sectors; 1172 int max_sectors;
1175 int sectors; 1173 int sectors;
1176 1174
1177 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1178 md_flush_request(mddev, bio);
1179 return;
1180 }
1181
1182 /* If this request crosses a chunk boundary, we need to
1183 * split it. This will only happen for 1 PAGE (or less) requests.
1184 */
1185 if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
1186 > chunk_sects
1187 && (conf->geo.near_copies < conf->geo.raid_disks
1188 || conf->prev.near_copies < conf->prev.raid_disks))) {
1189 struct bio_pair *bp;
1190 /* Sanity check -- queue functions should prevent this happening */
1191 if (bio_segments(bio) > 1)
1192 goto bad_map;
1193 /* This is a one page bio that upper layers
1194 * refuse to split for us, so we need to split it.
1195 */
1196 bp = bio_split(bio,
1197 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
1198
1199 /* Each of these 'make_request' calls will call 'wait_barrier'.
1200 * If the first succeeds but the second blocks due to the resync
1201 * thread raising the barrier, we will deadlock because the
1202 * IO to the underlying device will be queued in generic_make_request
1203 * and will never complete, so will never reduce nr_pending.
1204 * So increment nr_waiting here so no new raise_barriers will
1205 * succeed, and so the second wait_barrier cannot block.
1206 */
1207 spin_lock_irq(&conf->resync_lock);
1208 conf->nr_waiting++;
1209 spin_unlock_irq(&conf->resync_lock);
1210
1211 make_request(mddev, &bp->bio1);
1212 make_request(mddev, &bp->bio2);
1213
1214 spin_lock_irq(&conf->resync_lock);
1215 conf->nr_waiting--;
1216 wake_up(&conf->wait_barrier);
1217 spin_unlock_irq(&conf->resync_lock);
1218
1219 bio_pair_release(bp);
1220 return;
1221 bad_map:
1222 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1223 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1224 (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
1225
1226 bio_io_error(bio);
1227 return;
1228 }
1229
1230 md_write_start(mddev, bio);
1231
1232 /*
1233 * Register the new request and wait if the reconstruction
1234 * thread has put up a bar for new requests.
1235 * Continue immediately if no resync is active currently.
1236 */
1237 wait_barrier(conf);
1238
1239 sectors = bio_sectors(bio); 1175 sectors = bio_sectors(bio);
1240 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1176 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1241 bio->bi_sector < conf->reshape_progress && 1177 bio->bi_iter.bi_sector < conf->reshape_progress &&
1242 bio->bi_sector + sectors > conf->reshape_progress) { 1178 bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1243 /* IO spans the reshape position. Need to wait for 1179 /* IO spans the reshape position. Need to wait for
1244 * reshape to pass 1180 * reshape to pass
1245 */ 1181 */
1246 allow_barrier(conf); 1182 allow_barrier(conf);
1247 wait_event(conf->wait_barrier, 1183 wait_event(conf->wait_barrier,
1248 conf->reshape_progress <= bio->bi_sector || 1184 conf->reshape_progress <= bio->bi_iter.bi_sector ||
1249 conf->reshape_progress >= bio->bi_sector + sectors); 1185 conf->reshape_progress >= bio->bi_iter.bi_sector +
1186 sectors);
1250 wait_barrier(conf); 1187 wait_barrier(conf);
1251 } 1188 }
1252 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1189 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1253 bio_data_dir(bio) == WRITE && 1190 bio_data_dir(bio) == WRITE &&
1254 (mddev->reshape_backwards 1191 (mddev->reshape_backwards
1255 ? (bio->bi_sector < conf->reshape_safe && 1192 ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1256 bio->bi_sector + sectors > conf->reshape_progress) 1193 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1257 : (bio->bi_sector + sectors > conf->reshape_safe && 1194 : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1258 bio->bi_sector < conf->reshape_progress))) { 1195 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1259 /* Need to update reshape_position in metadata */ 1196 /* Need to update reshape_position in metadata */
1260 mddev->reshape_position = conf->reshape_progress; 1197 mddev->reshape_position = conf->reshape_progress;
1261 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1198 set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -1273,7 +1210,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
1273 r10_bio->sectors = sectors; 1210 r10_bio->sectors = sectors;
1274 1211
1275 r10_bio->mddev = mddev; 1212 r10_bio->mddev = mddev;
1276 r10_bio->sector = bio->bi_sector; 1213 r10_bio->sector = bio->bi_iter.bi_sector;
1277 r10_bio->state = 0; 1214 r10_bio->state = 0;
1278 1215
1279 /* We might need to issue multiple reads to different 1216 /* We might need to issue multiple reads to different
@@ -1302,13 +1239,13 @@ read_again:
1302 slot = r10_bio->read_slot; 1239 slot = r10_bio->read_slot;
1303 1240
1304 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1241 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1305 bio_trim(read_bio, r10_bio->sector - bio->bi_sector, 1242 bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
1306 max_sectors); 1243 max_sectors);
1307 1244
1308 r10_bio->devs[slot].bio = read_bio; 1245 r10_bio->devs[slot].bio = read_bio;
1309 r10_bio->devs[slot].rdev = rdev; 1246 r10_bio->devs[slot].rdev = rdev;
1310 1247
1311 read_bio->bi_sector = r10_bio->devs[slot].addr + 1248 read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1312 choose_data_offset(r10_bio, rdev); 1249 choose_data_offset(r10_bio, rdev);
1313 read_bio->bi_bdev = rdev->bdev; 1250 read_bio->bi_bdev = rdev->bdev;
1314 read_bio->bi_end_io = raid10_end_read_request; 1251 read_bio->bi_end_io = raid10_end_read_request;
@@ -1319,15 +1256,15 @@ read_again:
1319 /* Could not read all from this device, so we will 1256 /* Could not read all from this device, so we will
1320 * need another r10_bio. 1257 * need another r10_bio.
1321 */ 1258 */
1322 sectors_handled = (r10_bio->sectors + max_sectors 1259 sectors_handled = (r10_bio->sector + max_sectors
1323 - bio->bi_sector); 1260 - bio->bi_iter.bi_sector);
1324 r10_bio->sectors = max_sectors; 1261 r10_bio->sectors = max_sectors;
1325 spin_lock_irq(&conf->device_lock); 1262 spin_lock_irq(&conf->device_lock);
1326 if (bio->bi_phys_segments == 0) 1263 if (bio->bi_phys_segments == 0)
1327 bio->bi_phys_segments = 2; 1264 bio->bi_phys_segments = 2;
1328 else 1265 else
1329 bio->bi_phys_segments++; 1266 bio->bi_phys_segments++;
1330 spin_unlock(&conf->device_lock); 1267 spin_unlock_irq(&conf->device_lock);
1331 /* Cannot call generic_make_request directly 1268 /* Cannot call generic_make_request directly
1332 * as that will be queued in __generic_make_request 1269 * as that will be queued in __generic_make_request
1333 * and subsequent mempool_alloc might block 1270 * and subsequent mempool_alloc might block
@@ -1341,7 +1278,8 @@ read_again:
1341 r10_bio->sectors = bio_sectors(bio) - sectors_handled; 1278 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1342 r10_bio->state = 0; 1279 r10_bio->state = 0;
1343 r10_bio->mddev = mddev; 1280 r10_bio->mddev = mddev;
1344 r10_bio->sector = bio->bi_sector + sectors_handled; 1281 r10_bio->sector = bio->bi_iter.bi_sector +
1282 sectors_handled;
1345 goto read_again; 1283 goto read_again;
1346 } else 1284 } else
1347 generic_make_request(read_bio); 1285 generic_make_request(read_bio);
@@ -1499,7 +1437,8 @@ retry_write:
1499 bio->bi_phys_segments++; 1437 bio->bi_phys_segments++;
1500 spin_unlock_irq(&conf->device_lock); 1438 spin_unlock_irq(&conf->device_lock);
1501 } 1439 }
1502 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; 1440 sectors_handled = r10_bio->sector + max_sectors -
1441 bio->bi_iter.bi_sector;
1503 1442
1504 atomic_set(&r10_bio->remaining, 1); 1443 atomic_set(&r10_bio->remaining, 1);
1505 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 1444 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
@@ -1510,11 +1449,11 @@ retry_write:
1510 if (r10_bio->devs[i].bio) { 1449 if (r10_bio->devs[i].bio) {
1511 struct md_rdev *rdev = conf->mirrors[d].rdev; 1450 struct md_rdev *rdev = conf->mirrors[d].rdev;
1512 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1451 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1513 bio_trim(mbio, r10_bio->sector - bio->bi_sector, 1452 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1514 max_sectors); 1453 max_sectors);
1515 r10_bio->devs[i].bio = mbio; 1454 r10_bio->devs[i].bio = mbio;
1516 1455
1517 mbio->bi_sector = (r10_bio->devs[i].addr+ 1456 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
1518 choose_data_offset(r10_bio, 1457 choose_data_offset(r10_bio,
1519 rdev)); 1458 rdev));
1520 mbio->bi_bdev = rdev->bdev; 1459 mbio->bi_bdev = rdev->bdev;
@@ -1553,11 +1492,11 @@ retry_write:
1553 rdev = conf->mirrors[d].rdev; 1492 rdev = conf->mirrors[d].rdev;
1554 } 1493 }
1555 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1494 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1556 bio_trim(mbio, r10_bio->sector - bio->bi_sector, 1495 bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector,
1557 max_sectors); 1496 max_sectors);
1558 r10_bio->devs[i].repl_bio = mbio; 1497 r10_bio->devs[i].repl_bio = mbio;
1559 1498
1560 mbio->bi_sector = (r10_bio->devs[i].addr + 1499 mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
1561 choose_data_offset( 1500 choose_data_offset(
1562 r10_bio, rdev)); 1501 r10_bio, rdev));
1563 mbio->bi_bdev = rdev->bdev; 1502 mbio->bi_bdev = rdev->bdev;
@@ -1591,11 +1530,57 @@ retry_write:
1591 r10_bio->sectors = bio_sectors(bio) - sectors_handled; 1530 r10_bio->sectors = bio_sectors(bio) - sectors_handled;
1592 1531
1593 r10_bio->mddev = mddev; 1532 r10_bio->mddev = mddev;
1594 r10_bio->sector = bio->bi_sector + sectors_handled; 1533 r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1595 r10_bio->state = 0; 1534 r10_bio->state = 0;
1596 goto retry_write; 1535 goto retry_write;
1597 } 1536 }
1598 one_write_done(r10_bio); 1537 one_write_done(r10_bio);
1538}
1539
1540static void make_request(struct mddev *mddev, struct bio *bio)
1541{
1542 struct r10conf *conf = mddev->private;
1543 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1544 int chunk_sects = chunk_mask + 1;
1545
1546 struct bio *split;
1547
1548 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1549 md_flush_request(mddev, bio);
1550 return;
1551 }
1552
1553 md_write_start(mddev, bio);
1554
1555 /*
1556 * Register the new request and wait if the reconstruction
1557 * thread has put up a bar for new requests.
1558 * Continue immediately if no resync is active currently.
1559 */
1560 wait_barrier(conf);
1561
1562 do {
1563
1564 /*
1565 * If this request crosses a chunk boundary, we need to split
1566 * it.
1567 */
1568 if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1569 bio_sectors(bio) > chunk_sects
1570 && (conf->geo.near_copies < conf->geo.raid_disks
1571 || conf->prev.near_copies <
1572 conf->prev.raid_disks))) {
1573 split = bio_split(bio, chunk_sects -
1574 (bio->bi_iter.bi_sector &
1575 (chunk_sects - 1)),
1576 GFP_NOIO, fs_bio_set);
1577 bio_chain(split, bio);
1578 } else {
1579 split = bio;
1580 }
1581
1582 __make_request(mddev, split);
1583 } while (split != bio);
1599 1584
1600 /* In case raid10d snuck in to freeze_array */ 1585 /* In case raid10d snuck in to freeze_array */
1601 wake_up(&conf->wait_barrier); 1586 wake_up(&conf->wait_barrier);
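With the new make_request() above, a request that crosses a chunk boundary is no longer handled piecewise by tracking sectors_handled in the caller; it is cut with bio_split() and the fragment's completion is chained back to the parent with bio_chain(). A minimal sketch of that loop, assuming the 3.14 bio_split()/bio_chain() signatures; issue_request() is a hypothetical stand-in for __make_request(), and the geometry check from the patch is omitted:

#include <linux/bio.h>
#include <linux/blkdev.h>

/*
 * Sketch of the split-and-chain pattern used by the new make_request():
 * carve off the piece that fits below the next chunk boundary, chain its
 * completion to the parent, and loop until the remainder fits.
 * issue_request() is a hypothetical stand-in for __make_request().
 */
static void split_by_chunk(struct bio *bio, unsigned int chunk_sects,
                           void (*issue_request)(struct bio *))
{
        struct bio *split;

        do {
                unsigned int offset = bio->bi_iter.bi_sector & (chunk_sects - 1);

                if (offset + bio_sectors(bio) > chunk_sects) {
                        split = bio_split(bio, chunk_sects - offset,
                                          GFP_NOIO, fs_bio_set);
                        bio_chain(split, bio);  /* split completes into bio */
                } else {
                        split = bio;
                }
                issue_request(split);
        } while (split != bio);
}
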
@@ -2124,10 +2109,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2124 bio_reset(tbio); 2109 bio_reset(tbio);
2125 2110
2126 tbio->bi_vcnt = vcnt; 2111 tbio->bi_vcnt = vcnt;
2127 tbio->bi_size = r10_bio->sectors << 9; 2112 tbio->bi_iter.bi_size = r10_bio->sectors << 9;
2128 tbio->bi_rw = WRITE; 2113 tbio->bi_rw = WRITE;
2129 tbio->bi_private = r10_bio; 2114 tbio->bi_private = r10_bio;
2130 tbio->bi_sector = r10_bio->devs[i].addr; 2115 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2131 2116
2132 for (j=0; j < vcnt ; j++) { 2117 for (j=0; j < vcnt ; j++) {
2133 tbio->bi_io_vec[j].bv_offset = 0; 2118 tbio->bi_io_vec[j].bv_offset = 0;
@@ -2144,7 +2129,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2144 atomic_inc(&r10_bio->remaining); 2129 atomic_inc(&r10_bio->remaining);
2145 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); 2130 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2146 2131
2147 tbio->bi_sector += conf->mirrors[d].rdev->data_offset; 2132 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2148 tbio->bi_bdev = conf->mirrors[d].rdev->bdev; 2133 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2149 generic_make_request(tbio); 2134 generic_make_request(tbio);
2150 } 2135 }
@@ -2614,8 +2599,8 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
2614 sectors = sect_to_write; 2599 sectors = sect_to_write;
2615 /* Write at 'sector' for 'sectors' */ 2600 /* Write at 'sector' for 'sectors' */
2616 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 2601 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2617 bio_trim(wbio, sector - bio->bi_sector, sectors); 2602 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2618 wbio->bi_sector = (r10_bio->devs[i].addr+ 2603 wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
2619 choose_data_offset(r10_bio, rdev) + 2604 choose_data_offset(r10_bio, rdev) +
2620 (sector - r10_bio->sector)); 2605 (sector - r10_bio->sector));
2621 wbio->bi_bdev = rdev->bdev; 2606 wbio->bi_bdev = rdev->bdev;
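narrow_write_error(), like the write path earlier, clones the master bio and restricts the clone with bio_trim() instead of adjusting bi_sector and bi_size by hand. A minimal sketch of that clone-and-trim step; bio_clone_bioset() stands in here for the md-private bio_clone_mddev() wrapper:

#include <linux/bio.h>

/*
 * Sketch: clone a parent bio and restrict the clone to the sub-range
 * [sector, sector + sectors) of the parent.  bio_trim() takes the
 * offset into the clone and the new length, both in sectors.
 */
static struct bio *clone_subrange(struct bio *parent, sector_t sector,
                                  int sectors, struct bio_set *bs)
{
        struct bio *clone = bio_clone_bioset(parent, GFP_NOIO, bs);

        if (!clone)
                return NULL;
        bio_trim(clone, sector - parent->bi_iter.bi_sector, sectors);
        return clone;
}
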
@@ -2687,10 +2672,10 @@ read_more:
2687 (unsigned long long)r10_bio->sector); 2672 (unsigned long long)r10_bio->sector);
2688 bio = bio_clone_mddev(r10_bio->master_bio, 2673 bio = bio_clone_mddev(r10_bio->master_bio,
2689 GFP_NOIO, mddev); 2674 GFP_NOIO, mddev);
2690 bio_trim(bio, r10_bio->sector - bio->bi_sector, max_sectors); 2675 bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
2691 r10_bio->devs[slot].bio = bio; 2676 r10_bio->devs[slot].bio = bio;
2692 r10_bio->devs[slot].rdev = rdev; 2677 r10_bio->devs[slot].rdev = rdev;
2693 bio->bi_sector = r10_bio->devs[slot].addr 2678 bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
2694 + choose_data_offset(r10_bio, rdev); 2679 + choose_data_offset(r10_bio, rdev);
2695 bio->bi_bdev = rdev->bdev; 2680 bio->bi_bdev = rdev->bdev;
2696 bio->bi_rw = READ | do_sync; 2681 bio->bi_rw = READ | do_sync;
@@ -2701,7 +2686,7 @@ read_more:
2701 struct bio *mbio = r10_bio->master_bio; 2686 struct bio *mbio = r10_bio->master_bio;
2702 int sectors_handled = 2687 int sectors_handled =
2703 r10_bio->sector + max_sectors 2688 r10_bio->sector + max_sectors
2704 - mbio->bi_sector; 2689 - mbio->bi_iter.bi_sector;
2705 r10_bio->sectors = max_sectors; 2690 r10_bio->sectors = max_sectors;
2706 spin_lock_irq(&conf->device_lock); 2691 spin_lock_irq(&conf->device_lock);
2707 if (mbio->bi_phys_segments == 0) 2692 if (mbio->bi_phys_segments == 0)
@@ -2719,7 +2704,7 @@ read_more:
2719 set_bit(R10BIO_ReadError, 2704 set_bit(R10BIO_ReadError,
2720 &r10_bio->state); 2705 &r10_bio->state);
2721 r10_bio->mddev = mddev; 2706 r10_bio->mddev = mddev;
2722 r10_bio->sector = mbio->bi_sector 2707 r10_bio->sector = mbio->bi_iter.bi_sector
2723 + sectors_handled; 2708 + sectors_handled;
2724 2709
2725 goto read_more; 2710 goto read_more;
@@ -3157,7 +3142,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3157 bio->bi_end_io = end_sync_read; 3142 bio->bi_end_io = end_sync_read;
3158 bio->bi_rw = READ; 3143 bio->bi_rw = READ;
3159 from_addr = r10_bio->devs[j].addr; 3144 from_addr = r10_bio->devs[j].addr;
3160 bio->bi_sector = from_addr + rdev->data_offset; 3145 bio->bi_iter.bi_sector = from_addr +
3146 rdev->data_offset;
3161 bio->bi_bdev = rdev->bdev; 3147 bio->bi_bdev = rdev->bdev;
3162 atomic_inc(&rdev->nr_pending); 3148 atomic_inc(&rdev->nr_pending);
3163 /* and we write to 'i' (if not in_sync) */ 3149 /* and we write to 'i' (if not in_sync) */
@@ -3181,7 +3167,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3181 bio->bi_private = r10_bio; 3167 bio->bi_private = r10_bio;
3182 bio->bi_end_io = end_sync_write; 3168 bio->bi_end_io = end_sync_write;
3183 bio->bi_rw = WRITE; 3169 bio->bi_rw = WRITE;
3184 bio->bi_sector = to_addr 3170 bio->bi_iter.bi_sector = to_addr
3185 + rdev->data_offset; 3171 + rdev->data_offset;
3186 bio->bi_bdev = rdev->bdev; 3172 bio->bi_bdev = rdev->bdev;
3187 atomic_inc(&r10_bio->remaining); 3173 atomic_inc(&r10_bio->remaining);
@@ -3210,7 +3196,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3210 bio->bi_private = r10_bio; 3196 bio->bi_private = r10_bio;
3211 bio->bi_end_io = end_sync_write; 3197 bio->bi_end_io = end_sync_write;
3212 bio->bi_rw = WRITE; 3198 bio->bi_rw = WRITE;
3213 bio->bi_sector = to_addr + rdev->data_offset; 3199 bio->bi_iter.bi_sector = to_addr +
3200 rdev->data_offset;
3214 bio->bi_bdev = rdev->bdev; 3201 bio->bi_bdev = rdev->bdev;
3215 atomic_inc(&r10_bio->remaining); 3202 atomic_inc(&r10_bio->remaining);
3216 break; 3203 break;
@@ -3218,10 +3205,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3218 if (j == conf->copies) { 3205 if (j == conf->copies) {
3219 /* Cannot recover, so abort the recovery or 3206 /* Cannot recover, so abort the recovery or
3220 * record a bad block */ 3207 * record a bad block */
3221 put_buf(r10_bio);
3222 if (rb2)
3223 atomic_dec(&rb2->remaining);
3224 r10_bio = rb2;
3225 if (any_working) { 3208 if (any_working) {
3226 /* problem is that there are bad blocks 3209 /* problem is that there are bad blocks
3227 * on other device(s) 3210 * on other device(s)
@@ -3253,6 +3236,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3253 mirror->recovery_disabled 3236 mirror->recovery_disabled
3254 = mddev->recovery_disabled; 3237 = mddev->recovery_disabled;
3255 } 3238 }
3239 put_buf(r10_bio);
3240 if (rb2)
3241 atomic_dec(&rb2->remaining);
3242 r10_bio = rb2;
3256 break; 3243 break;
3257 } 3244 }
3258 } 3245 }
@@ -3328,7 +3315,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3328 bio->bi_private = r10_bio; 3315 bio->bi_private = r10_bio;
3329 bio->bi_end_io = end_sync_read; 3316 bio->bi_end_io = end_sync_read;
3330 bio->bi_rw = READ; 3317 bio->bi_rw = READ;
3331 bio->bi_sector = sector + 3318 bio->bi_iter.bi_sector = sector +
3332 conf->mirrors[d].rdev->data_offset; 3319 conf->mirrors[d].rdev->data_offset;
3333 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 3320 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3334 count++; 3321 count++;
@@ -3350,7 +3337,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3350 bio->bi_private = r10_bio; 3337 bio->bi_private = r10_bio;
3351 bio->bi_end_io = end_sync_write; 3338 bio->bi_end_io = end_sync_write;
3352 bio->bi_rw = WRITE; 3339 bio->bi_rw = WRITE;
3353 bio->bi_sector = sector + 3340 bio->bi_iter.bi_sector = sector +
3354 conf->mirrors[d].replacement->data_offset; 3341 conf->mirrors[d].replacement->data_offset;
3355 bio->bi_bdev = conf->mirrors[d].replacement->bdev; 3342 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3356 count++; 3343 count++;
@@ -3397,7 +3384,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
3397 bio2 = bio2->bi_next) { 3384 bio2 = bio2->bi_next) {
3398 /* remove last page from this bio */ 3385 /* remove last page from this bio */
3399 bio2->bi_vcnt--; 3386 bio2->bi_vcnt--;
3400 bio2->bi_size -= len; 3387 bio2->bi_iter.bi_size -= len;
3401 bio2->bi_flags &= ~(1<< BIO_SEG_VALID); 3388 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3402 } 3389 }
3403 goto bio_full; 3390 goto bio_full;
@@ -3747,7 +3734,8 @@ static int run(struct mddev *mddev)
3747 !test_bit(In_sync, &disk->rdev->flags)) { 3734 !test_bit(In_sync, &disk->rdev->flags)) {
3748 disk->head_position = 0; 3735 disk->head_position = 0;
3749 mddev->degraded++; 3736 mddev->degraded++;
3750 if (disk->rdev) 3737 if (disk->rdev &&
3738 disk->rdev->saved_raid_disk < 0)
3751 conf->fullsync = 1; 3739 conf->fullsync = 1;
3752 } 3740 }
3753 disk->recovery_disabled = mddev->recovery_disabled - 1; 3741 disk->recovery_disabled = mddev->recovery_disabled - 1;
@@ -4417,7 +4405,7 @@ read_more:
4417 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); 4405 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4418 4406
4419 read_bio->bi_bdev = rdev->bdev; 4407 read_bio->bi_bdev = rdev->bdev;
4420 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr 4408 read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4421 + rdev->data_offset); 4409 + rdev->data_offset);
4422 read_bio->bi_private = r10_bio; 4410 read_bio->bi_private = r10_bio;
4423 read_bio->bi_end_io = end_sync_read; 4411 read_bio->bi_end_io = end_sync_read;
@@ -4425,7 +4413,7 @@ read_more:
4425 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); 4413 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4426 read_bio->bi_flags |= 1 << BIO_UPTODATE; 4414 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4427 read_bio->bi_vcnt = 0; 4415 read_bio->bi_vcnt = 0;
4428 read_bio->bi_size = 0; 4416 read_bio->bi_iter.bi_size = 0;
4429 r10_bio->master_bio = read_bio; 4417 r10_bio->master_bio = read_bio;
4430 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; 4418 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4431 4419
@@ -4451,7 +4439,8 @@ read_more:
4451 4439
4452 bio_reset(b); 4440 bio_reset(b);
4453 b->bi_bdev = rdev2->bdev; 4441 b->bi_bdev = rdev2->bdev;
4454 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; 4442 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4443 rdev2->new_data_offset;
4455 b->bi_private = r10_bio; 4444 b->bi_private = r10_bio;
4456 b->bi_end_io = end_reshape_write; 4445 b->bi_end_io = end_reshape_write;
4457 b->bi_rw = WRITE; 4446 b->bi_rw = WRITE;
@@ -4478,7 +4467,7 @@ read_more:
4478 bio2 = bio2->bi_next) { 4467 bio2 = bio2->bi_next) {
4479 /* Remove last page from this bio */ 4468 /* Remove last page from this bio */
4480 bio2->bi_vcnt--; 4469 bio2->bi_vcnt--;
4481 bio2->bi_size -= len; 4470 bio2->bi_iter.bi_size -= len;
4482 bio2->bi_flags &= ~(1<<BIO_SEG_VALID); 4471 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4483 } 4472 }
4484 goto bio_full; 4473 goto bio_full;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cc055da02e2a..16f5c21963db 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -133,7 +133,7 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
133static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) 133static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
134{ 134{
135 int sectors = bio_sectors(bio); 135 int sectors = bio_sectors(bio);
136 if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) 136 if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
137 return bio->bi_next; 137 return bio->bi_next;
138 else 138 else
139 return NULL; 139 return NULL;
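r5_next_bio() above walks the singly linked chain of bios attached to one stripe device (linked through bi_next, sorted by start sector) and stops once the next bio begins past the STRIPE_SECTORS window. The later hunks repeat the same walk; a minimal sketch of it, with process_bio() as a hypothetical per-bio callback:

/*
 * Sketch of the per-stripe bio walk used throughout raid5.c: bios attached
 * to one stripe device are linked through bi_next and sorted by start
 * sector, and the walk ends once a bio begins past the STRIPE_SECTORS
 * window.  process_bio() is a hypothetical stand-in for the real work
 * (copying data, completing the bio, ...).
 */
static void walk_stripe_bios(struct bio *head, sector_t dev_sector,
                             void (*process_bio)(struct bio *))
{
        struct bio *bio = head;

        while (bio && bio->bi_iter.bi_sector < dev_sector + STRIPE_SECTORS) {
                struct bio *next = r5_next_bio(bio, dev_sector);

                process_bio(bio);
                bio = next;
        }
}
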
@@ -225,7 +225,7 @@ static void return_io(struct bio *return_bi)
225 225
226 return_bi = bi->bi_next; 226 return_bi = bi->bi_next;
227 bi->bi_next = NULL; 227 bi->bi_next = NULL;
228 bi->bi_size = 0; 228 bi->bi_iter.bi_size = 0;
229 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 229 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
230 bi, 0); 230 bi, 0);
231 bio_endio(bi, 0); 231 bio_endio(bi, 0);
@@ -675,8 +675,10 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
675 || !conf->inactive_blocked), 675 || !conf->inactive_blocked),
676 *(conf->hash_locks + hash)); 676 *(conf->hash_locks + hash));
677 conf->inactive_blocked = 0; 677 conf->inactive_blocked = 0;
678 } else 678 } else {
679 init_stripe(sh, sector, previous); 679 init_stripe(sh, sector, previous);
680 atomic_inc(&sh->count);
681 }
680 } else { 682 } else {
681 spin_lock(&conf->device_lock); 683 spin_lock(&conf->device_lock);
682 if (atomic_read(&sh->count)) { 684 if (atomic_read(&sh->count)) {
@@ -687,20 +689,19 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
687 } else { 689 } else {
688 if (!test_bit(STRIPE_HANDLE, &sh->state)) 690 if (!test_bit(STRIPE_HANDLE, &sh->state))
689 atomic_inc(&conf->active_stripes); 691 atomic_inc(&conf->active_stripes);
690 BUG_ON(list_empty(&sh->lru)); 692 BUG_ON(list_empty(&sh->lru) &&
693 !test_bit(STRIPE_EXPANDING, &sh->state));
691 list_del_init(&sh->lru); 694 list_del_init(&sh->lru);
692 if (sh->group) { 695 if (sh->group) {
693 sh->group->stripes_cnt--; 696 sh->group->stripes_cnt--;
694 sh->group = NULL; 697 sh->group = NULL;
695 } 698 }
696 } 699 }
700 atomic_inc(&sh->count);
697 spin_unlock(&conf->device_lock); 701 spin_unlock(&conf->device_lock);
698 } 702 }
699 } while (sh == NULL); 703 } while (sh == NULL);
700 704
701 if (sh)
702 atomic_inc(&sh->count);
703
704 spin_unlock_irq(conf->hash_locks + hash); 705 spin_unlock_irq(conf->hash_locks + hash);
705 return sh; 706 return sh;
706} 707}
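The get_active_stripe() change moves atomic_inc(&sh->count) inside the locked sections, so the reference is taken while the lookup structures are still protected rather than after the loop. A minimal sketch of that take-the-reference-before-unlocking pattern; struct obj and lookup_locked() are hypothetical stand-ins for the stripe hash lookup:

#include <linux/spinlock.h>
#include <linux/atomic.h>

/*
 * Sketch of the "take the reference before dropping the lookup lock"
 * pattern: the object found under the lock cannot be recycled between
 * the lookup and the refcount increment.  struct obj and lookup_locked()
 * are hypothetical.
 */
struct obj {
        atomic_t count;
};

static struct obj *lookup_locked(sector_t key);         /* hypothetical */

static struct obj *get_ref(spinlock_t *lock, sector_t key)
{
        struct obj *o;

        spin_lock_irq(lock);
        o = lookup_locked(key);
        if (o)
                atomic_inc(&o->count);  /* ref taken while still locked */
        spin_unlock_irq(lock);
        return o;
}
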
@@ -851,10 +852,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
851 bi->bi_rw, i); 852 bi->bi_rw, i);
852 atomic_inc(&sh->count); 853 atomic_inc(&sh->count);
853 if (use_new_offset(conf, sh)) 854 if (use_new_offset(conf, sh))
854 bi->bi_sector = (sh->sector 855 bi->bi_iter.bi_sector = (sh->sector
855 + rdev->new_data_offset); 856 + rdev->new_data_offset);
856 else 857 else
857 bi->bi_sector = (sh->sector 858 bi->bi_iter.bi_sector = (sh->sector
858 + rdev->data_offset); 859 + rdev->data_offset);
859 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 860 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
860 bi->bi_rw |= REQ_NOMERGE; 861 bi->bi_rw |= REQ_NOMERGE;
@@ -862,7 +863,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
862 bi->bi_vcnt = 1; 863 bi->bi_vcnt = 1;
863 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 864 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
864 bi->bi_io_vec[0].bv_offset = 0; 865 bi->bi_io_vec[0].bv_offset = 0;
865 bi->bi_size = STRIPE_SIZE; 866 bi->bi_iter.bi_size = STRIPE_SIZE;
866 /* 867 /*
867 * If this is discard request, set bi_vcnt 0. We don't 868 * If this is discard request, set bi_vcnt 0. We don't
868 * want to confuse SCSI because SCSI will replace payload 869 * want to confuse SCSI because SCSI will replace payload
@@ -898,15 +899,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
898 rbi->bi_rw, i); 899 rbi->bi_rw, i);
899 atomic_inc(&sh->count); 900 atomic_inc(&sh->count);
900 if (use_new_offset(conf, sh)) 901 if (use_new_offset(conf, sh))
901 rbi->bi_sector = (sh->sector 902 rbi->bi_iter.bi_sector = (sh->sector
902 + rrdev->new_data_offset); 903 + rrdev->new_data_offset);
903 else 904 else
904 rbi->bi_sector = (sh->sector 905 rbi->bi_iter.bi_sector = (sh->sector
905 + rrdev->data_offset); 906 + rrdev->data_offset);
906 rbi->bi_vcnt = 1; 907 rbi->bi_vcnt = 1;
907 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 908 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
908 rbi->bi_io_vec[0].bv_offset = 0; 909 rbi->bi_io_vec[0].bv_offset = 0;
909 rbi->bi_size = STRIPE_SIZE; 910 rbi->bi_iter.bi_size = STRIPE_SIZE;
910 /* 911 /*
911 * If this is discard request, set bi_vcnt 0. We don't 912 * If this is discard request, set bi_vcnt 0. We don't
912 * want to confuse SCSI because SCSI will replace payload 913 * want to confuse SCSI because SCSI will replace payload
@@ -934,24 +935,24 @@ static struct dma_async_tx_descriptor *
934async_copy_data(int frombio, struct bio *bio, struct page *page, 935async_copy_data(int frombio, struct bio *bio, struct page *page,
935 sector_t sector, struct dma_async_tx_descriptor *tx) 936 sector_t sector, struct dma_async_tx_descriptor *tx)
936{ 937{
937 struct bio_vec *bvl; 938 struct bio_vec bvl;
939 struct bvec_iter iter;
938 struct page *bio_page; 940 struct page *bio_page;
939 int i;
940 int page_offset; 941 int page_offset;
941 struct async_submit_ctl submit; 942 struct async_submit_ctl submit;
942 enum async_tx_flags flags = 0; 943 enum async_tx_flags flags = 0;
943 944
944 if (bio->bi_sector >= sector) 945 if (bio->bi_iter.bi_sector >= sector)
945 page_offset = (signed)(bio->bi_sector - sector) * 512; 946 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
946 else 947 else
947 page_offset = (signed)(sector - bio->bi_sector) * -512; 948 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
948 949
949 if (frombio) 950 if (frombio)
950 flags |= ASYNC_TX_FENCE; 951 flags |= ASYNC_TX_FENCE;
951 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 952 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
952 953
953 bio_for_each_segment(bvl, bio, i) { 954 bio_for_each_segment(bvl, bio, iter) {
954 int len = bvl->bv_len; 955 int len = bvl.bv_len;
955 int clen; 956 int clen;
956 int b_offset = 0; 957 int b_offset = 0;
957 958
@@ -967,8 +968,8 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
967 clen = len; 968 clen = len;
968 969
969 if (clen > 0) { 970 if (clen > 0) {
970 b_offset += bvl->bv_offset; 971 b_offset += bvl.bv_offset;
971 bio_page = bvl->bv_page; 972 bio_page = bvl.bv_page;
972 if (frombio) 973 if (frombio)
973 tx = async_memcpy(page, bio_page, page_offset, 974 tx = async_memcpy(page, bio_page, page_offset,
974 b_offset, clen, &submit); 975 b_offset, clen, &submit);
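async_copy_data() now iterates with the 3.14 bio_for_each_segment(), which yields a struct bio_vec by value and keeps its position in a separate struct bvec_iter, instead of indexing bi_io_vec with an int. A minimal sketch of the new idiom:

#include <linux/bio.h>

/*
 * Sketch of the immutable-biovec iteration idiom: bvl is a struct bio_vec
 * copy (not a pointer into bi_io_vec), and iter carries the position, so
 * the walk also works on partially advanced and split bios.
 */
static unsigned int count_payload_bytes(struct bio *bio)
{
        struct bio_vec bvl;
        struct bvec_iter iter;
        unsigned int bytes = 0;

        bio_for_each_segment(bvl, bio, iter)
                bytes += bvl.bv_len;

        return bytes;
}
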
@@ -1011,7 +1012,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
1011 BUG_ON(!dev->read); 1012 BUG_ON(!dev->read);
1012 rbi = dev->read; 1013 rbi = dev->read;
1013 dev->read = NULL; 1014 dev->read = NULL;
1014 while (rbi && rbi->bi_sector < 1015 while (rbi && rbi->bi_iter.bi_sector <
1015 dev->sector + STRIPE_SECTORS) { 1016 dev->sector + STRIPE_SECTORS) {
1016 rbi2 = r5_next_bio(rbi, dev->sector); 1017 rbi2 = r5_next_bio(rbi, dev->sector);
1017 if (!raid5_dec_bi_active_stripes(rbi)) { 1018 if (!raid5_dec_bi_active_stripes(rbi)) {
@@ -1047,7 +1048,7 @@ static void ops_run_biofill(struct stripe_head *sh)
1047 dev->read = rbi = dev->toread; 1048 dev->read = rbi = dev->toread;
1048 dev->toread = NULL; 1049 dev->toread = NULL;
1049 spin_unlock_irq(&sh->stripe_lock); 1050 spin_unlock_irq(&sh->stripe_lock);
1050 while (rbi && rbi->bi_sector < 1051 while (rbi && rbi->bi_iter.bi_sector <
1051 dev->sector + STRIPE_SECTORS) { 1052 dev->sector + STRIPE_SECTORS) {
1052 tx = async_copy_data(0, rbi, dev->page, 1053 tx = async_copy_data(0, rbi, dev->page,
1053 dev->sector, tx); 1054 dev->sector, tx);
@@ -1389,7 +1390,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1389 wbi = dev->written = chosen; 1390 wbi = dev->written = chosen;
1390 spin_unlock_irq(&sh->stripe_lock); 1391 spin_unlock_irq(&sh->stripe_lock);
1391 1392
1392 while (wbi && wbi->bi_sector < 1393 while (wbi && wbi->bi_iter.bi_sector <
1393 dev->sector + STRIPE_SECTORS) { 1394 dev->sector + STRIPE_SECTORS) {
1394 if (wbi->bi_rw & REQ_FUA) 1395 if (wbi->bi_rw & REQ_FUA)
1395 set_bit(R5_WantFUA, &dev->flags); 1396 set_bit(R5_WantFUA, &dev->flags);
@@ -2110,6 +2111,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
2110 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2111 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2111 } else { 2112 } else {
2112 if (!uptodate) { 2113 if (!uptodate) {
2114 set_bit(STRIPE_DEGRADED, &sh->state);
2113 set_bit(WriteErrorSeen, &rdev->flags); 2115 set_bit(WriteErrorSeen, &rdev->flags);
2114 set_bit(R5_WriteError, &sh->dev[i].flags); 2116 set_bit(R5_WriteError, &sh->dev[i].flags);
2115 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2117 if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -2613,7 +2615,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2613 int firstwrite=0; 2615 int firstwrite=0;
2614 2616
2615 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2617 pr_debug("adding bi b#%llu to stripe s#%llu\n",
2616 (unsigned long long)bi->bi_sector, 2618 (unsigned long long)bi->bi_iter.bi_sector,
2617 (unsigned long long)sh->sector); 2619 (unsigned long long)sh->sector);
2618 2620
2619 /* 2621 /*
@@ -2631,12 +2633,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2631 firstwrite = 1; 2633 firstwrite = 1;
2632 } else 2634 } else
2633 bip = &sh->dev[dd_idx].toread; 2635 bip = &sh->dev[dd_idx].toread;
2634 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2636 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
2635 if (bio_end_sector(*bip) > bi->bi_sector) 2637 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
2636 goto overlap; 2638 goto overlap;
2637 bip = & (*bip)->bi_next; 2639 bip = & (*bip)->bi_next;
2638 } 2640 }
2639 if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) 2641 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
2640 goto overlap; 2642 goto overlap;
2641 2643
2642 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2644 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
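add_stripe_bio() keeps each device's toread/towrite chain sorted by start sector and rejects bios that overlap one already queued; the comparisons above now read the start from bi_iter.bi_sector and the end via bio_end_sector(). A minimal sketch of that sorted insert with the overlap check:

#include <linux/bio.h>
#include <linux/errno.h>

/*
 * Sketch of the sorted insert used by add_stripe_bio(): walk the bi_next
 * chain (kept sorted by start sector), refuse any overlap, and link the
 * new bio in place.  Returns 0 on success, -EBUSY on overlap.
 */
static int insert_sorted_nonoverlapping(struct bio **head, struct bio *bi)
{
        struct bio **bip = head;

        while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
                if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
                        return -EBUSY;          /* overlaps a queued bio */
                bip = &(*bip)->bi_next;
        }
        if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
                return -EBUSY;

        bi->bi_next = *bip;
        *bip = bi;
        return 0;
}
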
@@ -2650,7 +2652,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2650 sector_t sector = sh->dev[dd_idx].sector; 2652 sector_t sector = sh->dev[dd_idx].sector;
2651 for (bi=sh->dev[dd_idx].towrite; 2653 for (bi=sh->dev[dd_idx].towrite;
2652 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2654 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2653 bi && bi->bi_sector <= sector; 2655 bi && bi->bi_iter.bi_sector <= sector;
2654 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2656 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2655 if (bio_end_sector(bi) >= sector) 2657 if (bio_end_sector(bi) >= sector)
2656 sector = bio_end_sector(bi); 2658 sector = bio_end_sector(bi);
@@ -2660,7 +2662,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2660 } 2662 }
2661 2663
2662 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2664 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2663 (unsigned long long)(*bip)->bi_sector, 2665 (unsigned long long)(*bip)->bi_iter.bi_sector,
2664 (unsigned long long)sh->sector, dd_idx); 2666 (unsigned long long)sh->sector, dd_idx);
2665 spin_unlock_irq(&sh->stripe_lock); 2667 spin_unlock_irq(&sh->stripe_lock);
2666 2668
@@ -2735,7 +2737,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2735 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2737 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2736 wake_up(&conf->wait_for_overlap); 2738 wake_up(&conf->wait_for_overlap);
2737 2739
2738 while (bi && bi->bi_sector < 2740 while (bi && bi->bi_iter.bi_sector <
2739 sh->dev[i].sector + STRIPE_SECTORS) { 2741 sh->dev[i].sector + STRIPE_SECTORS) {
2740 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2742 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2741 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2743 clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -2754,7 +2756,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2754 bi = sh->dev[i].written; 2756 bi = sh->dev[i].written;
2755 sh->dev[i].written = NULL; 2757 sh->dev[i].written = NULL;
2756 if (bi) bitmap_end = 1; 2758 if (bi) bitmap_end = 1;
2757 while (bi && bi->bi_sector < 2759 while (bi && bi->bi_iter.bi_sector <
2758 sh->dev[i].sector + STRIPE_SECTORS) { 2760 sh->dev[i].sector + STRIPE_SECTORS) {
2759 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2761 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2760 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2762 clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -2778,7 +2780,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2778 spin_unlock_irq(&sh->stripe_lock); 2780 spin_unlock_irq(&sh->stripe_lock);
2779 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2781 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2780 wake_up(&conf->wait_for_overlap); 2782 wake_up(&conf->wait_for_overlap);
2781 while (bi && bi->bi_sector < 2783 while (bi && bi->bi_iter.bi_sector <
2782 sh->dev[i].sector + STRIPE_SECTORS) { 2784 sh->dev[i].sector + STRIPE_SECTORS) {
2783 struct bio *nextbi = 2785 struct bio *nextbi =
2784 r5_next_bio(bi, sh->dev[i].sector); 2786 r5_next_bio(bi, sh->dev[i].sector);
@@ -3002,7 +3004,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
3002 clear_bit(R5_UPTODATE, &dev->flags); 3004 clear_bit(R5_UPTODATE, &dev->flags);
3003 wbi = dev->written; 3005 wbi = dev->written;
3004 dev->written = NULL; 3006 dev->written = NULL;
3005 while (wbi && wbi->bi_sector < 3007 while (wbi && wbi->bi_iter.bi_sector <
3006 dev->sector + STRIPE_SECTORS) { 3008 dev->sector + STRIPE_SECTORS) {
3007 wbi2 = r5_next_bio(wbi, dev->sector); 3009 wbi2 = r5_next_bio(wbi, dev->sector);
3008 if (!raid5_dec_bi_active_stripes(wbi)) { 3010 if (!raid5_dec_bi_active_stripes(wbi)) {
@@ -3608,7 +3610,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3608 */ 3610 */
3609 set_bit(R5_Insync, &dev->flags); 3611 set_bit(R5_Insync, &dev->flags);
3610 3612
3611 if (rdev && test_bit(R5_WriteError, &dev->flags)) { 3613 if (test_bit(R5_WriteError, &dev->flags)) {
3612 /* This flag does not apply to '.replacement' 3614 /* This flag does not apply to '.replacement'
3613 * only to .rdev, so make sure to check that*/ 3615 * only to .rdev, so make sure to check that*/
3614 struct md_rdev *rdev2 = rcu_dereference( 3616 struct md_rdev *rdev2 = rcu_dereference(
@@ -3621,7 +3623,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3621 } else 3623 } else
3622 clear_bit(R5_WriteError, &dev->flags); 3624 clear_bit(R5_WriteError, &dev->flags);
3623 } 3625 }
3624 if (rdev && test_bit(R5_MadeGood, &dev->flags)) { 3626 if (test_bit(R5_MadeGood, &dev->flags)) {
3625 /* This flag does not apply to '.replacement' 3627 /* This flag does not apply to '.replacement'
3626 * only to .rdev, so make sure to check that*/ 3628 * only to .rdev, so make sure to check that*/
3627 struct md_rdev *rdev2 = rcu_dereference( 3629 struct md_rdev *rdev2 = rcu_dereference(
@@ -4094,7 +4096,7 @@ static int raid5_mergeable_bvec(struct request_queue *q,
4094 4096
4095static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 4097static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
4096{ 4098{
4097 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 4099 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
4098 unsigned int chunk_sectors = mddev->chunk_sectors; 4100 unsigned int chunk_sectors = mddev->chunk_sectors;
4099 unsigned int bio_sectors = bio_sectors(bio); 4101 unsigned int bio_sectors = bio_sectors(bio);
4100 4102
@@ -4231,9 +4233,9 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4231 /* 4233 /*
4232 * compute position 4234 * compute position
4233 */ 4235 */
4234 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 4236 align_bi->bi_iter.bi_sector =
4235 0, 4237 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
4236 &dd_idx, NULL); 4238 0, &dd_idx, NULL);
4237 4239
4238 end_sector = bio_end_sector(align_bi); 4240 end_sector = bio_end_sector(align_bi);
4239 rcu_read_lock(); 4241 rcu_read_lock();
@@ -4258,7 +4260,8 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4258 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 4260 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
4259 4261
4260 if (!bio_fits_rdev(align_bi) || 4262 if (!bio_fits_rdev(align_bi) ||
4261 is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), 4263 is_badblock(rdev, align_bi->bi_iter.bi_sector,
4264 bio_sectors(align_bi),
4262 &first_bad, &bad_sectors)) { 4265 &first_bad, &bad_sectors)) {
4263 /* too big in some way, or has a known bad block */ 4266 /* too big in some way, or has a known bad block */
4264 bio_put(align_bi); 4267 bio_put(align_bi);
@@ -4267,7 +4270,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4267 } 4270 }
4268 4271
4269 /* No reshape active, so we can trust rdev->data_offset */ 4272 /* No reshape active, so we can trust rdev->data_offset */
4270 align_bi->bi_sector += rdev->data_offset; 4273 align_bi->bi_iter.bi_sector += rdev->data_offset;
4271 4274
4272 spin_lock_irq(&conf->device_lock); 4275 spin_lock_irq(&conf->device_lock);
4273 wait_event_lock_irq(conf->wait_for_stripe, 4276 wait_event_lock_irq(conf->wait_for_stripe,
@@ -4279,7 +4282,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
4279 if (mddev->gendisk) 4282 if (mddev->gendisk)
4280 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4283 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
4281 align_bi, disk_devt(mddev->gendisk), 4284 align_bi, disk_devt(mddev->gendisk),
4282 raid_bio->bi_sector); 4285 raid_bio->bi_iter.bi_sector);
4283 generic_make_request(align_bi); 4286 generic_make_request(align_bi);
4284 return 1; 4287 return 1;
4285 } else { 4288 } else {
@@ -4462,8 +4465,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
4462 /* Skip discard while reshape is happening */ 4465 /* Skip discard while reshape is happening */
4463 return; 4466 return;
4464 4467
4465 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4468 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4466 last_sector = bi->bi_sector + (bi->bi_size>>9); 4469 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
4467 4470
4468 bi->bi_next = NULL; 4471 bi->bi_next = NULL;
4469 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4472 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
@@ -4567,7 +4570,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4567 return; 4570 return;
4568 } 4571 }
4569 4572
4570 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4573 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4571 last_sector = bio_end_sector(bi); 4574 last_sector = bio_end_sector(bi);
4572 bi->bi_next = NULL; 4575 bi->bi_next = NULL;
4573 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 4576 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
@@ -5051,7 +5054,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
5051 int remaining; 5054 int remaining;
5052 int handled = 0; 5055 int handled = 0;
5053 5056
5054 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5057 logical_sector = raid_bio->bi_iter.bi_sector &
5058 ~((sector_t)STRIPE_SECTORS-1);
5055 sector = raid5_compute_sector(conf, logical_sector, 5059 sector = raid5_compute_sector(conf, logical_sector,
5056 0, &dd_idx, NULL); 5060 0, &dd_idx, NULL);
5057 last_sector = bio_end_sector(raid_bio); 5061 last_sector = bio_end_sector(raid_bio);
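The three hunks above (discard, write and retried-read paths) all round the bio's start sector down to a STRIPE_SECTORS boundary before mapping it onto stripes, now reading the sector from bi_iter. A minimal sketch of that rounding, assuming STRIPE_SECTORS is the power-of-two constant from raid5.h:

#include <linux/bio.h>

/*
 * Sketch: round a bio's start down to a stripe boundary and compute the
 * first sector past its end.  STRIPE_SECTORS (from raid5.h) is a power
 * of two, so the mask is equivalent to sector - (sector % STRIPE_SECTORS).
 */
static void stripe_window(struct bio *bio, sector_t *first, sector_t *past_end)
{
        *first    = bio->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS - 1);
        *past_end = bio_end_sector(bio);
}
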
@@ -5510,23 +5514,43 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
5510 return sectors * (raid_disks - conf->max_degraded); 5514 return sectors * (raid_disks - conf->max_degraded);
5511} 5515}
5512 5516
5517static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
5518{
5519 safe_put_page(percpu->spare_page);
5520 kfree(percpu->scribble);
5521 percpu->spare_page = NULL;
5522 percpu->scribble = NULL;
5523}
5524
5525static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
5526{
5527 if (conf->level == 6 && !percpu->spare_page)
5528 percpu->spare_page = alloc_page(GFP_KERNEL);
5529 if (!percpu->scribble)
5530 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
5531
5532 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
5533 free_scratch_buffer(conf, percpu);
5534 return -ENOMEM;
5535 }
5536
5537 return 0;
5538}
5539
5513static void raid5_free_percpu(struct r5conf *conf) 5540static void raid5_free_percpu(struct r5conf *conf)
5514{ 5541{
5515 struct raid5_percpu *percpu;
5516 unsigned long cpu; 5542 unsigned long cpu;
5517 5543
5518 if (!conf->percpu) 5544 if (!conf->percpu)
5519 return; 5545 return;
5520 5546
5521 get_online_cpus();
5522 for_each_possible_cpu(cpu) {
5523 percpu = per_cpu_ptr(conf->percpu, cpu);
5524 safe_put_page(percpu->spare_page);
5525 kfree(percpu->scribble);
5526 }
5527#ifdef CONFIG_HOTPLUG_CPU 5547#ifdef CONFIG_HOTPLUG_CPU
5528 unregister_cpu_notifier(&conf->cpu_notify); 5548 unregister_cpu_notifier(&conf->cpu_notify);
5529#endif 5549#endif
5550
5551 get_online_cpus();
5552 for_each_possible_cpu(cpu)
5553 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
5530 put_online_cpus(); 5554 put_online_cpus();
5531 5555
5532 free_percpu(conf->percpu); 5556 free_percpu(conf->percpu);
@@ -5553,15 +5577,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5553 switch (action) { 5577 switch (action) {
5554 case CPU_UP_PREPARE: 5578 case CPU_UP_PREPARE:
5555 case CPU_UP_PREPARE_FROZEN: 5579 case CPU_UP_PREPARE_FROZEN:
5556 if (conf->level == 6 && !percpu->spare_page) 5580 if (alloc_scratch_buffer(conf, percpu)) {
5557 percpu->spare_page = alloc_page(GFP_KERNEL);
5558 if (!percpu->scribble)
5559 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
5560
5561 if (!percpu->scribble ||
5562 (conf->level == 6 && !percpu->spare_page)) {
5563 safe_put_page(percpu->spare_page);
5564 kfree(percpu->scribble);
5565 pr_err("%s: failed memory allocation for cpu%ld\n", 5581 pr_err("%s: failed memory allocation for cpu%ld\n",
5566 __func__, cpu); 5582 __func__, cpu);
5567 return notifier_from_errno(-ENOMEM); 5583 return notifier_from_errno(-ENOMEM);
@@ -5569,10 +5585,7 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5569 break; 5585 break;
5570 case CPU_DEAD: 5586 case CPU_DEAD:
5571 case CPU_DEAD_FROZEN: 5587 case CPU_DEAD_FROZEN:
5572 safe_put_page(percpu->spare_page); 5588 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
5573 kfree(percpu->scribble);
5574 percpu->spare_page = NULL;
5575 percpu->scribble = NULL;
5576 break; 5589 break;
5577 default: 5590 default:
5578 break; 5591 break;
@@ -5584,40 +5597,29 @@ static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
5584static int raid5_alloc_percpu(struct r5conf *conf) 5597static int raid5_alloc_percpu(struct r5conf *conf)
5585{ 5598{
5586 unsigned long cpu; 5599 unsigned long cpu;
5587 struct page *spare_page; 5600 int err = 0;
5588 struct raid5_percpu __percpu *allcpus;
5589 void *scribble;
5590 int err;
5591 5601
5592 allcpus = alloc_percpu(struct raid5_percpu); 5602 conf->percpu = alloc_percpu(struct raid5_percpu);
5593 if (!allcpus) 5603 if (!conf->percpu)
5594 return -ENOMEM; 5604 return -ENOMEM;
5595 conf->percpu = allcpus; 5605
5606#ifdef CONFIG_HOTPLUG_CPU
5607 conf->cpu_notify.notifier_call = raid456_cpu_notify;
5608 conf->cpu_notify.priority = 0;
5609 err = register_cpu_notifier(&conf->cpu_notify);
5610 if (err)
5611 return err;
5612#endif
5596 5613
5597 get_online_cpus(); 5614 get_online_cpus();
5598 err = 0;
5599 for_each_present_cpu(cpu) { 5615 for_each_present_cpu(cpu) {
5600 if (conf->level == 6) { 5616 err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
5601 spare_page = alloc_page(GFP_KERNEL); 5617 if (err) {
5602 if (!spare_page) { 5618 pr_err("%s: failed memory allocation for cpu%ld\n",
5603 err = -ENOMEM; 5619 __func__, cpu);
5604 break;
5605 }
5606 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
5607 }
5608 scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
5609 if (!scribble) {
5610 err = -ENOMEM;
5611 break; 5620 break;
5612 } 5621 }
5613 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
5614 } 5622 }
5615#ifdef CONFIG_HOTPLUG_CPU
5616 conf->cpu_notify.notifier_call = raid456_cpu_notify;
5617 conf->cpu_notify.priority = 0;
5618 if (err == 0)
5619 err = register_cpu_notifier(&conf->cpu_notify);
5620#endif
5621 put_online_cpus(); 5623 put_online_cpus();
5622 5624
5623 return err; 5625 return err;
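raid5_alloc_percpu() now registers the hotplug notifier before filling in the buffers for the CPUs already present, and both the init loop and raid456_cpu_notify() share alloc_scratch_buffer(), which cleans up after itself on failure. A minimal sketch of that allocate-or-release-everything helper pattern; struct scratch and its fields are hypothetical stand-ins for raid5_percpu:

#include <linux/gfp.h>
#include <linux/slab.h>

/*
 * Sketch of the "helper owns its own cleanup" pattern behind
 * alloc_scratch_buffer()/free_scratch_buffer(): the allocator releases
 * anything it managed to allocate before reporting -ENOMEM, so every
 * caller (init loop and hotplug notifier alike) handles failure the
 * same way.  struct scratch and the parameters are hypothetical.
 */
struct scratch {
        struct page *spare_page;
        void *scribble;
};

static void free_scratch(struct scratch *s)
{
        if (s->spare_page)
                __free_page(s->spare_page);
        kfree(s->scribble);
        s->spare_page = NULL;
        s->scribble = NULL;
}

static int alloc_scratch(struct scratch *s, size_t scribble_len, bool need_page)
{
        if (need_page && !s->spare_page)
                s->spare_page = alloc_page(GFP_KERNEL);
        if (!s->scribble)
                s->scribble = kmalloc(scribble_len, GFP_KERNEL);

        if (!s->scribble || (need_page && !s->spare_page)) {
                free_scratch(s);        /* release the partial allocation */
                return -ENOMEM;
        }
        return 0;
}
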
@@ -6099,6 +6101,7 @@ static int run(struct mddev *mddev)
6099 blk_queue_io_min(mddev->queue, chunk_size); 6101 blk_queue_io_min(mddev->queue, chunk_size);
6100 blk_queue_io_opt(mddev->queue, chunk_size * 6102 blk_queue_io_opt(mddev->queue, chunk_size *
6101 (conf->raid_disks - conf->max_degraded)); 6103 (conf->raid_disks - conf->max_degraded));
6104 mddev->queue->limits.raid_partial_stripes_expensive = 1;
6102 /* 6105 /*
6103 * We can only discard a whole stripe. It doesn't make sense to 6106 * We can only discard a whole stripe. It doesn't make sense to
6104 * discard data disk but write parity disk 6107 * discard data disk but write parity disk