path: root/drivers/md/bcache
author    Linus Torvalds <torvalds@linux-foundation.org>  2014-01-30 14:40:10 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-01-30 14:40:10 -0500
commit    53d8ab29f8f6d67e37857b68189b38fa3d87dd8e (patch)
tree      3c770b58f0404c67b1b084f626dcafa8464c7512 /drivers/md/bcache
parent    f568849edac8611d603e00bd6cbbcfea09395ae6 (diff)
parent    14424be4dbfa127001ad623869f7ee4c7635e991 (diff)
Merge branch 'for-3.14/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver changes from Jens Axboe:

 - bcache update from Kent Overstreet.

 - two bcache fixes from Nicholas Swenson.

 - cciss pci init error fix from Andrew.

 - underflow fix in the parallel IDE pg_write code from Dan Carpenter.
   I'm sure the 1 (or 0) users of that are now happy.

 - two PCI related fixes for sx8 from Jingoo Han.

 - floppy init fix for first block read from Jiri Kosina.

 - pktcdvd error return miss fix from Julia Lawall.

 - removal of IRQF_SHARED from the SEGA Dreamcast CD-ROM code from
   Michael Opdenacker.

 - comment typo fix for the loop driver from Olaf Hering.

 - potential oops fix for null_blk from Raghavendra K T.

 - two fixes from Sam Bradshaw (Micron) for the mtip32xx driver, fixing
   an OOM problem and a problem with handling security locked conditions

* 'for-3.14/drivers' of git://git.kernel.dk/linux-block: (47 commits)
  mg_disk: Spelling s/finised/finished/
  null_blk: Null pointer deference problem in alloc_page_buffers
  mtip32xx: Correctly handle security locked condition
  mtip32xx: Make SGL container per-command to eliminate high order dma allocation
  drivers/block/loop.c: fix comment typo in loop_config_discard
  drivers/block/cciss.c:cciss_init_one(): use proper errnos
  drivers/block/paride/pg.c: underflow bug in pg_write()
  drivers/block/sx8.c: remove unnecessary pci_set_drvdata()
  drivers/block/sx8.c: use module_pci_driver()
  floppy: bail out in open() if drive is not responding to block0 read
  bcache: Fix auxiliary search trees for key size > cacheline size
  bcache: Don't return -EINTR when insert finished
  bcache: Improve bucket_prio() calculation
  bcache: Add bch_bkey_equal_header()
  bcache: update bch_bkey_try_merge
  bcache: Move insert_fixup() to btree_keys_ops
  bcache: Convert sorting to btree_keys
  bcache: Convert debug code to btree_keys
  bcache: Convert btree_iter to struct btree_keys
  bcache: Refactor bset_tree sysfs stats
  ...
Diffstat (limited to 'drivers/md/bcache')
-rw-r--r--  drivers/md/bcache/Makefile      5
-rw-r--r--  drivers/md/bcache/alloc.c      89
-rw-r--r--  drivers/md/bcache/bcache.h     82
-rw-r--r--  drivers/md/bcache/bset.c      904
-rw-r--r--  drivers/md/bcache/bset.h      440
-rw-r--r--  drivers/md/bcache/btree.c     676
-rw-r--r--  drivers/md/bcache/btree.h      62
-rw-r--r--  drivers/md/bcache/closure.c    90
-rw-r--r--  drivers/md/bcache/closure.h   355
-rw-r--r--  drivers/md/bcache/debug.c     247
-rw-r--r--  drivers/md/bcache/debug.h      27
-rw-r--r--  drivers/md/bcache/extents.c   616
-rw-r--r--  drivers/md/bcache/extents.h    13
-rw-r--r--  drivers/md/bcache/journal.c    75
-rw-r--r--  drivers/md/bcache/journal.h     1
-rw-r--r--  drivers/md/bcache/movinggc.c    2
-rw-r--r--  drivers/md/bcache/request.c    72
-rw-r--r--  drivers/md/bcache/request.h    21
-rw-r--r--  drivers/md/bcache/super.c     103
-rw-r--r--  drivers/md/bcache/sysfs.c      79
-rw-r--r--  drivers/md/bcache/util.h        8
21 files changed, 2212 insertions, 1755 deletions
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 0e9c82523be6..c488b846f831 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -1,7 +1,8 @@
1 1
2obj-$(CONFIG_BCACHE) += bcache.o 2obj-$(CONFIG_BCACHE) += bcache.o
3 3
4bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ 4bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
5 movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o 5 io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
6 util.o writeback.o
6 7
7CFLAGS_request.o += -Iblock 8CFLAGS_request.o += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4c9852d92b0a..c0d37d082443 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
132{ 132{
133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); 133 BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
134 134
135 if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && 135 if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
136 CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) 136 unsigned i;
137 return false; 137
138 for (i = 0; i < RESERVE_NONE; i++)
139 if (!fifo_full(&ca->free[i]))
140 goto add;
138 141
142 return false;
143 }
144add:
139 b->prio = 0; 145 b->prio = 0;
140 146
141 if (can_inc_bucket_gen(b) && 147 if (can_inc_bucket_gen(b) &&
@@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
162 fifo_push(&ca->free_inc, b - ca->buckets); 168 fifo_push(&ca->free_inc, b - ca->buckets);
163} 169}
164 170
165#define bucket_prio(b) \ 171/*
166 (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b)) 172 * Determines what order we're going to reuse buckets, smallest bucket_prio()
173 * first: we also take into account the number of sectors of live data in that
174 * bucket, and in order for that multiply to make sense we have to scale bucket
175 *
176 * Thus, we scale the bucket priorities so that the bucket with the smallest
177 * prio is worth 1/8th of what INITIAL_PRIO is worth.
178 */
179
180#define bucket_prio(b) \
181({ \
182 unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
183 \
184 (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
185})
167 186
168#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) 187#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
169#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) 188#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
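
The effect of the new bucket_prio() is easier to see with concrete numbers. The following is a minimal standalone sketch with made-up prio values (not taken from a real cache set): adding the (INITIAL_PRIO - min_prio) / 8 offset keeps even the coldest bucket's priority above zero, so its count of live sectors still influences the heap ordering.

/* Standalone illustration of the reworked bucket_prio() scaling; all
 * numbers below are hypothetical. */
#include <stdio.h>

#define INITIAL_PRIO 32768U

static unsigned bucket_prio(unsigned set_min_prio, unsigned prio,
			    unsigned live_sectors)
{
	unsigned min_prio = (INITIAL_PRIO - set_min_prio) / 8;

	return (prio - set_min_prio + min_prio) * live_sectors;
}

int main(void)
{
	unsigned set_min_prio = 20000;

	/* Coldest possible bucket (prio == set minimum) holding lots of live data... */
	printf("cold, 100 sectors: %u\n", bucket_prio(set_min_prio, 20000, 100));
	/* ...versus a warmer bucket that is nearly empty. */
	printf("warm,  10 sectors: %u\n", bucket_prio(set_min_prio, 25000, 10));

	/* With the old macro the first value would have been 0 no matter how
	 * many live sectors the bucket held. */
	return 0;
}
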
@@ -304,6 +323,21 @@ do { \
304 __set_current_state(TASK_RUNNING); \ 323 __set_current_state(TASK_RUNNING); \
305} while (0) 324} while (0)
306 325
326static int bch_allocator_push(struct cache *ca, long bucket)
327{
328 unsigned i;
329
330 /* Prios/gens are actually the most important reserve */
331 if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
332 return true;
333
334 for (i = 0; i < RESERVE_NR; i++)
335 if (fifo_push(&ca->free[i], bucket))
336 return true;
337
338 return false;
339}
340
307static int bch_allocator_thread(void *arg) 341static int bch_allocator_thread(void *arg)
308{ 342{
309 struct cache *ca = arg; 343 struct cache *ca = arg;
@@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg)
336 mutex_lock(&ca->set->bucket_lock); 370 mutex_lock(&ca->set->bucket_lock);
337 } 371 }
338 372
339 allocator_wait(ca, !fifo_full(&ca->free)); 373 allocator_wait(ca, bch_allocator_push(ca, bucket));
340
341 fifo_push(&ca->free, bucket);
342 wake_up(&ca->set->bucket_wait); 374 wake_up(&ca->set->bucket_wait);
343 } 375 }
344 376
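
Since the single free fifo is now split into one fifo per reserve, the allocator thread refills them through bch_allocator_push() above. Below is a rough user-space sketch of the same fallback pattern; the fifo type and its capacity are hypothetical stand-ins, not the kernel's fifo macros.

/* Sketch of the per-reserve fallback in bch_allocator_push(); the fifo
 * type and capacity here are hypothetical stand-ins. */
#include <stdbool.h>
#include <stdio.h>

enum alloc_reserve { RESERVE_BTREE, RESERVE_PRIO, RESERVE_MOVINGGC,
		     RESERVE_NONE, RESERVE_NR };

struct fifo { long data[8]; unsigned used; };

static bool fifo_push(struct fifo *f, long v)
{
	if (f->used == 8)
		return false;
	f->data[f->used++] = v;
	return true;
}

static bool allocator_push(struct fifo free_lists[RESERVE_NR], long bucket)
{
	unsigned i;

	/* Prios/gens are the most important reserve, so top it up first. */
	if (fifo_push(&free_lists[RESERVE_PRIO], bucket))
		return true;

	/* Otherwise hand the bucket to whichever reserve has room. */
	for (i = 0; i < RESERVE_NR; i++)
		if (fifo_push(&free_lists[i], bucket))
			return true;

	return false;	/* every reserve is full: allocator_wait() blocks */
}

int main(void)
{
	struct fifo free_lists[RESERVE_NR] = { { { 0 }, 0 } };

	printf("pushed: %d\n", allocator_push(free_lists, 42));
	return 0;
}
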
@@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg)
365 } 397 }
366} 398}
367 399
368long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) 400long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
369{ 401{
370 DEFINE_WAIT(w); 402 DEFINE_WAIT(w);
371 struct bucket *b; 403 struct bucket *b;
372 long r; 404 long r;
373 405
374 /* fastpath */ 406 /* fastpath */
375 if (fifo_used(&ca->free) > ca->watermark[watermark]) { 407 if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
376 fifo_pop(&ca->free, r); 408 fifo_pop(&ca->free[reserve], r))
377 goto out; 409 goto out;
378 }
379 410
380 if (!wait) 411 if (!wait)
381 return -1; 412 return -1;
382 413
383 while (1) { 414 do {
384 if (fifo_used(&ca->free) > ca->watermark[watermark]) {
385 fifo_pop(&ca->free, r);
386 break;
387 }
388
389 prepare_to_wait(&ca->set->bucket_wait, &w, 415 prepare_to_wait(&ca->set->bucket_wait, &w,
390 TASK_UNINTERRUPTIBLE); 416 TASK_UNINTERRUPTIBLE);
391 417
392 mutex_unlock(&ca->set->bucket_lock); 418 mutex_unlock(&ca->set->bucket_lock);
393 schedule(); 419 schedule();
394 mutex_lock(&ca->set->bucket_lock); 420 mutex_lock(&ca->set->bucket_lock);
395 } 421 } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
422 !fifo_pop(&ca->free[reserve], r));
396 423
397 finish_wait(&ca->set->bucket_wait, &w); 424 finish_wait(&ca->set->bucket_wait, &w);
398out: 425out:
@@ -401,12 +428,14 @@ out:
401 if (expensive_debug_checks(ca->set)) { 428 if (expensive_debug_checks(ca->set)) {
402 size_t iter; 429 size_t iter;
403 long i; 430 long i;
431 unsigned j;
404 432
405 for (iter = 0; iter < prio_buckets(ca) * 2; iter++) 433 for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
406 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); 434 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
407 435
408 fifo_for_each(i, &ca->free, iter) 436 for (j = 0; j < RESERVE_NR; j++)
409 BUG_ON(i == r); 437 fifo_for_each(i, &ca->free[j], iter)
438 BUG_ON(i == r);
410 fifo_for_each(i, &ca->free_inc, iter) 439 fifo_for_each(i, &ca->free_inc, iter)
411 BUG_ON(i == r); 440 BUG_ON(i == r);
412 fifo_for_each(i, &ca->unused, iter) 441 fifo_for_each(i, &ca->unused, iter)
@@ -419,7 +448,7 @@ out:
419 448
420 SET_GC_SECTORS_USED(b, ca->sb.bucket_size); 449 SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
421 450
422 if (watermark <= WATERMARK_METADATA) { 451 if (reserve <= RESERVE_PRIO) {
423 SET_GC_MARK(b, GC_MARK_METADATA); 452 SET_GC_MARK(b, GC_MARK_METADATA);
424 SET_GC_MOVE(b, 0); 453 SET_GC_MOVE(b, 0);
425 b->prio = BTREE_PRIO; 454 b->prio = BTREE_PRIO;
@@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
445 } 474 }
446} 475}
447 476
448int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 477int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
449 struct bkey *k, int n, bool wait) 478 struct bkey *k, int n, bool wait)
450{ 479{
451 int i; 480 int i;
@@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
459 488
460 for (i = 0; i < n; i++) { 489 for (i = 0; i < n; i++) {
461 struct cache *ca = c->cache_by_alloc[i]; 490 struct cache *ca = c->cache_by_alloc[i];
462 long b = bch_bucket_alloc(ca, watermark, wait); 491 long b = bch_bucket_alloc(ca, reserve, wait);
463 492
464 if (b == -1) 493 if (b == -1)
465 goto err; 494 goto err;
@@ -478,12 +507,12 @@ err:
478 return -1; 507 return -1;
479} 508}
480 509
481int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, 510int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
482 struct bkey *k, int n, bool wait) 511 struct bkey *k, int n, bool wait)
483{ 512{
484 int ret; 513 int ret;
485 mutex_lock(&c->bucket_lock); 514 mutex_lock(&c->bucket_lock);
486 ret = __bch_bucket_alloc_set(c, watermark, k, n, wait); 515 ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
487 mutex_unlock(&c->bucket_lock); 516 mutex_unlock(&c->bucket_lock);
488 return ret; 517 return ret;
489} 518}
@@ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
573 602
574 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { 603 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
575 unsigned watermark = write_prio 604 unsigned watermark = write_prio
576 ? WATERMARK_MOVINGGC 605 ? RESERVE_MOVINGGC
577 : WATERMARK_NONE; 606 : RESERVE_NONE;
578 607
579 spin_unlock(&c->data_bucket_lock); 608 spin_unlock(&c->data_bucket_lock);
580 609
@@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca)
689 * Then 8 for btree allocations 718 * Then 8 for btree allocations
690 * Then half for the moving garbage collector 719 * Then half for the moving garbage collector
691 */ 720 */
692 721#if 0
693 ca->watermark[WATERMARK_PRIO] = 0; 722 ca->watermark[WATERMARK_PRIO] = 0;
694 723
695 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); 724 ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
@@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca)
699 728
700 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + 729 ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
701 ca->watermark[WATERMARK_MOVINGGC]; 730 ca->watermark[WATERMARK_MOVINGGC];
702 731#endif
703 return 0; 732 return 0;
704} 733}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dbdbca5a9591..0c707e4f4eaf 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -187,6 +187,7 @@
187#include <linux/types.h> 187#include <linux/types.h>
188#include <linux/workqueue.h> 188#include <linux/workqueue.h>
189 189
190#include "bset.h"
190#include "util.h" 191#include "util.h"
191#include "closure.h" 192#include "closure.h"
192 193
@@ -309,7 +310,8 @@ struct cached_dev {
309 struct cache_sb sb; 310 struct cache_sb sb;
310 struct bio sb_bio; 311 struct bio sb_bio;
311 struct bio_vec sb_bv[1]; 312 struct bio_vec sb_bv[1];
312 struct closure_with_waitlist sb_write; 313 struct closure sb_write;
314 struct semaphore sb_write_mutex;
313 315
314 /* Refcount on the cache set. Always nonzero when we're caching. */ 316 /* Refcount on the cache set. Always nonzero when we're caching. */
315 atomic_t count; 317 atomic_t count;
@@ -382,12 +384,12 @@ struct cached_dev {
382 unsigned writeback_rate_p_term_inverse; 384 unsigned writeback_rate_p_term_inverse;
383}; 385};
384 386
385enum alloc_watermarks { 387enum alloc_reserve {
386 WATERMARK_PRIO, 388 RESERVE_BTREE,
387 WATERMARK_METADATA, 389 RESERVE_PRIO,
388 WATERMARK_MOVINGGC, 390 RESERVE_MOVINGGC,
389 WATERMARK_NONE, 391 RESERVE_NONE,
390 WATERMARK_MAX 392 RESERVE_NR,
391}; 393};
392 394
393struct cache { 395struct cache {
@@ -399,8 +401,6 @@ struct cache {
399 struct kobject kobj; 401 struct kobject kobj;
400 struct block_device *bdev; 402 struct block_device *bdev;
401 403
402 unsigned watermark[WATERMARK_MAX];
403
404 struct task_struct *alloc_thread; 404 struct task_struct *alloc_thread;
405 405
406 struct closure prio; 406 struct closure prio;
@@ -429,7 +429,7 @@ struct cache {
429 * because all the data they contained was overwritten), so we only 429 * because all the data they contained was overwritten), so we only
430 * need to discard them before they can be moved to the free list. 430 * need to discard them before they can be moved to the free list.
431 */ 431 */
432 DECLARE_FIFO(long, free); 432 DECLARE_FIFO(long, free)[RESERVE_NR];
433 DECLARE_FIFO(long, free_inc); 433 DECLARE_FIFO(long, free_inc);
434 DECLARE_FIFO(long, unused); 434 DECLARE_FIFO(long, unused);
435 435
@@ -514,7 +514,8 @@ struct cache_set {
514 uint64_t cached_dev_sectors; 514 uint64_t cached_dev_sectors;
515 struct closure caching; 515 struct closure caching;
516 516
517 struct closure_with_waitlist sb_write; 517 struct closure sb_write;
518 struct semaphore sb_write_mutex;
518 519
519 mempool_t *search; 520 mempool_t *search;
520 mempool_t *bio_meta; 521 mempool_t *bio_meta;
@@ -629,13 +630,15 @@ struct cache_set {
629 630
630#ifdef CONFIG_BCACHE_DEBUG 631#ifdef CONFIG_BCACHE_DEBUG
631 struct btree *verify_data; 632 struct btree *verify_data;
633 struct bset *verify_ondisk;
632 struct mutex verify_lock; 634 struct mutex verify_lock;
633#endif 635#endif
634 636
635 unsigned nr_uuids; 637 unsigned nr_uuids;
636 struct uuid_entry *uuids; 638 struct uuid_entry *uuids;
637 BKEY_PADDED(uuid_bucket); 639 BKEY_PADDED(uuid_bucket);
638 struct closure_with_waitlist uuid_write; 640 struct closure uuid_write;
641 struct semaphore uuid_write_mutex;
639 642
640 /* 643 /*
641 * A btree node on disk could have too many bsets for an iterator to fit 644 * A btree node on disk could have too many bsets for an iterator to fit
@@ -643,13 +646,7 @@ struct cache_set {
643 */ 646 */
644 mempool_t *fill_iter; 647 mempool_t *fill_iter;
645 648
646 /* 649 struct bset_sort_state sort;
647 * btree_sort() is a merge sort and requires temporary space - single
648 * element mempool
649 */
650 struct mutex sort_lock;
651 struct bset *sort;
652 unsigned sort_crit_factor;
653 650
654 /* List of buckets we're currently writing data to */ 651 /* List of buckets we're currently writing data to */
655 struct list_head data_buckets; 652 struct list_head data_buckets;
@@ -665,7 +662,6 @@ struct cache_set {
665 unsigned congested_read_threshold_us; 662 unsigned congested_read_threshold_us;
666 unsigned congested_write_threshold_us; 663 unsigned congested_write_threshold_us;
667 664
668 struct time_stats sort_time;
669 struct time_stats btree_gc_time; 665 struct time_stats btree_gc_time;
670 struct time_stats btree_split_time; 666 struct time_stats btree_split_time;
671 struct time_stats btree_read_time; 667 struct time_stats btree_read_time;
@@ -683,9 +679,9 @@ struct cache_set {
683 unsigned error_decay; 679 unsigned error_decay;
684 680
685 unsigned short journal_delay_ms; 681 unsigned short journal_delay_ms;
682 bool expensive_debug_checks;
686 unsigned verify:1; 683 unsigned verify:1;
687 unsigned key_merging_disabled:1; 684 unsigned key_merging_disabled:1;
688 unsigned expensive_debug_checks:1;
689 unsigned gc_always_rewrite:1; 685 unsigned gc_always_rewrite:1;
690 unsigned shrinker_disabled:1; 686 unsigned shrinker_disabled:1;
691 unsigned copy_gc_enabled:1; 687 unsigned copy_gc_enabled:1;
@@ -707,13 +703,8 @@ struct bbio {
707 struct bio bio; 703 struct bio bio;
708}; 704};
709 705
710static inline unsigned local_clock_us(void)
711{
712 return local_clock() >> 10;
713}
714
715#define BTREE_PRIO USHRT_MAX 706#define BTREE_PRIO USHRT_MAX
716#define INITIAL_PRIO 32768 707#define INITIAL_PRIO 32768U
717 708
718#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) 709#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE)
719#define btree_blocks(b) \ 710#define btree_blocks(b) \
@@ -726,21 +717,6 @@ static inline unsigned local_clock_us(void)
726#define bucket_bytes(c) ((c)->sb.bucket_size << 9) 717#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
727#define block_bytes(c) ((c)->sb.block_size << 9) 718#define block_bytes(c) ((c)->sb.block_size << 9)
728 719
729#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
730#define set_bytes(i) __set_bytes(i, i->keys)
731
732#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
733#define set_blocks(i, c) __set_blocks(i, (i)->keys, c)
734
735#define node(i, j) ((struct bkey *) ((i)->d + (j)))
736#define end(i) node(i, (i)->keys)
737
738#define index(i, b) \
739 ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \
740 block_bytes(b->c)))
741
742#define btree_data_space(b) (PAGE_SIZE << (b)->page_order)
743
744#define prios_per_bucket(c) \ 720#define prios_per_bucket(c) \
745 ((bucket_bytes(c) - sizeof(struct prio_set)) / \ 721 ((bucket_bytes(c) - sizeof(struct prio_set)) / \
746 sizeof(struct bucket_disk)) 722 sizeof(struct bucket_disk))
@@ -783,20 +759,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
783 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); 759 return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
784} 760}
785 761
786/* Btree key macros */ 762static inline uint8_t gen_after(uint8_t a, uint8_t b)
763{
764 uint8_t r = a - b;
765 return r > 128U ? 0 : r;
766}
787 767
788static inline void bkey_init(struct bkey *k) 768static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
769 unsigned i)
789{ 770{
790 *k = ZERO_KEY; 771 return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
791} 772}
792 773
774static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
775 unsigned i)
776{
777 return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
778}
779
780/* Btree key macros */
781
793/* 782/*
794 * This is used for various on disk data structures - cache_sb, prio_set, bset, 783 * This is used for various on disk data structures - cache_sb, prio_set, bset,
795 * jset: The checksum is _always_ the first 8 bytes of these structs 784 * jset: The checksum is _always_ the first 8 bytes of these structs
796 */ 785 */
797#define csum_set(i) \ 786#define csum_set(i) \
798 bch_crc64(((void *) (i)) + sizeof(uint64_t), \ 787 bch_crc64(((void *) (i)) + sizeof(uint64_t), \
799 ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) 788 ((void *) bset_bkey_last(i)) - \
789 (((void *) (i)) + sizeof(uint64_t)))
800 790
801/* Error handling macros */ 791/* Error handling macros */
802 792
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 7d388b8bb50e..4f6b5940e609 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -5,30 +5,134 @@
5 * Copyright 2012 Google, Inc. 5 * Copyright 2012 Google, Inc.
6 */ 6 */
7 7
8#include "bcache.h" 8#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
9#include "btree.h"
10#include "debug.h"
11 9
10#include "util.h"
11#include "bset.h"
12
13#include <linux/console.h>
12#include <linux/random.h> 14#include <linux/random.h>
13#include <linux/prefetch.h> 15#include <linux/prefetch.h>
14 16
17#ifdef CONFIG_BCACHE_DEBUG
18
19void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
20{
21 struct bkey *k, *next;
22
23 for (k = i->start; k < bset_bkey_last(i); k = next) {
24 next = bkey_next(k);
25
26 printk(KERN_ERR "block %u key %zi/%u: ", set,
27 (uint64_t *) k - i->d, i->keys);
28
29 if (b->ops->key_dump)
30 b->ops->key_dump(b, k);
31 else
32 printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
33
34 if (next < bset_bkey_last(i) &&
35 bkey_cmp(k, b->ops->is_extents ?
36 &START_KEY(next) : next) > 0)
37 printk(KERN_ERR "Key skipped backwards\n");
38 }
39}
40
41void bch_dump_bucket(struct btree_keys *b)
42{
43 unsigned i;
44
45 console_lock();
46 for (i = 0; i <= b->nsets; i++)
47 bch_dump_bset(b, b->set[i].data,
48 bset_sector_offset(b, b->set[i].data));
49 console_unlock();
50}
51
52int __bch_count_data(struct btree_keys *b)
53{
54 unsigned ret = 0;
55 struct btree_iter iter;
56 struct bkey *k;
57
58 if (b->ops->is_extents)
59 for_each_key(b, k, &iter)
60 ret += KEY_SIZE(k);
61 return ret;
62}
63
64void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
65{
66 va_list args;
67 struct bkey *k, *p = NULL;
68 struct btree_iter iter;
69 const char *err;
70
71 for_each_key(b, k, &iter) {
72 if (b->ops->is_extents) {
73 err = "Keys out of order";
74 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
75 goto bug;
76
77 if (bch_ptr_invalid(b, k))
78 continue;
79
80 err = "Overlapping keys";
81 if (p && bkey_cmp(p, &START_KEY(k)) > 0)
82 goto bug;
83 } else {
84 if (bch_ptr_bad(b, k))
85 continue;
86
87 err = "Duplicate keys";
88 if (p && !bkey_cmp(p, k))
89 goto bug;
90 }
91 p = k;
92 }
93#if 0
94 err = "Key larger than btree node key";
95 if (p && bkey_cmp(p, &b->key) > 0)
96 goto bug;
97#endif
98 return;
99bug:
100 bch_dump_bucket(b);
101
102 va_start(args, fmt);
103 vprintk(fmt, args);
104 va_end(args);
105
106 panic("bch_check_keys error: %s:\n", err);
107}
108
109static void bch_btree_iter_next_check(struct btree_iter *iter)
110{
111 struct bkey *k = iter->data->k, *next = bkey_next(k);
112
113 if (next < iter->data->end &&
114 bkey_cmp(k, iter->b->ops->is_extents ?
115 &START_KEY(next) : next) > 0) {
116 bch_dump_bucket(iter->b);
117 panic("Key skipped backwards\n");
118 }
119}
120
121#else
122
123static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
124
125#endif
126
15/* Keylists */ 127/* Keylists */
16 128
17int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c) 129int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
18{ 130{
19 size_t oldsize = bch_keylist_nkeys(l); 131 size_t oldsize = bch_keylist_nkeys(l);
20 size_t newsize = oldsize + 2 + nptrs; 132 size_t newsize = oldsize + u64s;
21 uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; 133 uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
22 uint64_t *new_keys; 134 uint64_t *new_keys;
23 135
24 /* The journalling code doesn't handle the case where the keys to insert
25 * is bigger than an empty write: If we just return -ENOMEM here,
26 * bio_insert() and bio_invalidate() will insert the keys created so far
27 * and finish the rest when the keylist is empty.
28 */
29 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
30 return -ENOMEM;
31
32 newsize = roundup_pow_of_two(newsize); 136 newsize = roundup_pow_of_two(newsize);
33 137
34 if (newsize <= KEYLIST_INLINE || 138 if (newsize <= KEYLIST_INLINE ||
@@ -71,136 +175,6 @@ void bch_keylist_pop_front(struct keylist *l)
71 bch_keylist_bytes(l)); 175 bch_keylist_bytes(l));
72} 176}
73 177
74/* Pointer validation */
75
76static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
77{
78 unsigned i;
79
80 for (i = 0; i < KEY_PTRS(k); i++)
81 if (ptr_available(c, k, i)) {
82 struct cache *ca = PTR_CACHE(c, k, i);
83 size_t bucket = PTR_BUCKET_NR(c, k, i);
84 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
85
86 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
87 bucket < ca->sb.first_bucket ||
88 bucket >= ca->sb.nbuckets)
89 return true;
90 }
91
92 return false;
93}
94
95bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
96{
97 char buf[80];
98
99 if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
100 goto bad;
101
102 if (__ptr_invalid(c, k))
103 goto bad;
104
105 return false;
106bad:
107 bch_bkey_to_text(buf, sizeof(buf), k);
108 cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
109 return true;
110}
111
112bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
113{
114 char buf[80];
115
116 if (!KEY_SIZE(k))
117 return true;
118
119 if (KEY_SIZE(k) > KEY_OFFSET(k))
120 goto bad;
121
122 if (__ptr_invalid(c, k))
123 goto bad;
124
125 return false;
126bad:
127 bch_bkey_to_text(buf, sizeof(buf), k);
128 cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
129 return true;
130}
131
132static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
133 unsigned ptr)
134{
135 struct bucket *g = PTR_BUCKET(b->c, k, ptr);
136 char buf[80];
137
138 if (mutex_trylock(&b->c->bucket_lock)) {
139 if (b->level) {
140 if (KEY_DIRTY(k) ||
141 g->prio != BTREE_PRIO ||
142 (b->c->gc_mark_valid &&
143 GC_MARK(g) != GC_MARK_METADATA))
144 goto err;
145
146 } else {
147 if (g->prio == BTREE_PRIO)
148 goto err;
149
150 if (KEY_DIRTY(k) &&
151 b->c->gc_mark_valid &&
152 GC_MARK(g) != GC_MARK_DIRTY)
153 goto err;
154 }
155 mutex_unlock(&b->c->bucket_lock);
156 }
157
158 return false;
159err:
160 mutex_unlock(&b->c->bucket_lock);
161 bch_bkey_to_text(buf, sizeof(buf), k);
162 btree_bug(b,
163"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
164 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
165 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
166 return true;
167}
168
169bool bch_ptr_bad(struct btree *b, const struct bkey *k)
170{
171 struct bucket *g;
172 unsigned i, stale;
173
174 if (!bkey_cmp(k, &ZERO_KEY) ||
175 !KEY_PTRS(k) ||
176 bch_ptr_invalid(b, k))
177 return true;
178
179 for (i = 0; i < KEY_PTRS(k); i++) {
180 if (!ptr_available(b->c, k, i))
181 return true;
182
183 g = PTR_BUCKET(b->c, k, i);
184 stale = ptr_stale(b->c, k, i);
185
186 btree_bug_on(stale > 96, b,
187 "key too stale: %i, need_gc %u",
188 stale, b->c->need_gc);
189
190 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
191 b, "stale dirty pointer");
192
193 if (stale)
194 return true;
195
196 if (expensive_debug_checks(b->c) &&
197 ptr_bad_expensive_checks(b, k, i))
198 return true;
199 }
200
201 return false;
202}
203
204/* Key/pointer manipulation */ 178/* Key/pointer manipulation */
205 179
206void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, 180void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
@@ -255,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k)
255 return true; 229 return true;
256} 230}
257 231
258static uint64_t merge_chksums(struct bkey *l, struct bkey *r) 232/* Auxiliary search trees */
233
234/* 32 bits total: */
235#define BKEY_MID_BITS 3
236#define BKEY_EXPONENT_BITS 7
237#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
238#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
239
240struct bkey_float {
241 unsigned exponent:BKEY_EXPONENT_BITS;
242 unsigned m:BKEY_MID_BITS;
243 unsigned mantissa:BKEY_MANTISSA_BITS;
244} __packed;
245
246/*
247 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
248 * it used to be 64, but I realized the lookup code would touch slightly less
249 * memory if it was 128.
250 *
251 * It definites the number of bytes (in struct bset) per struct bkey_float in
252 * the auxiliar search tree - when we're done searching the bset_float tree we
253 * have this many bytes left that we do a linear search over.
254 *
255 * Since (after level 5) every level of the bset_tree is on a new cacheline,
256 * we're touching one fewer cacheline in the bset tree in exchange for one more
257 * cacheline in the linear search - but the linear search might stop before it
258 * gets to the second cacheline.
259 */
260
261#define BSET_CACHELINE 128
262
263/* Space required for the btree node keys */
264static inline size_t btree_keys_bytes(struct btree_keys *b)
259{ 265{
260 return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & 266 return PAGE_SIZE << b->page_order;
261 ~((uint64_t)1 << 63);
262} 267}
263 268
264/* Tries to merge l and r: l should be lower than r 269static inline size_t btree_keys_cachelines(struct btree_keys *b)
265 * Returns true if we were able to merge. If we did merge, l will be the merged
266 * key, r will be untouched.
267 */
268bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
269{ 270{
270 unsigned i; 271 return btree_keys_bytes(b) / BSET_CACHELINE;
272}
271 273
272 if (key_merging_disabled(b->c)) 274/* Space required for the auxiliary search trees */
273 return false; 275static inline size_t bset_tree_bytes(struct btree_keys *b)
276{
277 return btree_keys_cachelines(b) * sizeof(struct bkey_float);
278}
274 279
275 if (KEY_PTRS(l) != KEY_PTRS(r) || 280/* Space required for the prev pointers */
276 KEY_DIRTY(l) != KEY_DIRTY(r) || 281static inline size_t bset_prev_bytes(struct btree_keys *b)
277 bkey_cmp(l, &START_KEY(r))) 282{
278 return false; 283 return btree_keys_cachelines(b) * sizeof(uint8_t);
284}
279 285
280 for (i = 0; i < KEY_PTRS(l); i++) 286/* Memory allocation */
281 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
282 PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
283 return false;
284 287
285 /* Keys with no pointers aren't restricted to one bucket and could 288void bch_btree_keys_free(struct btree_keys *b)
286 * overflow KEY_SIZE 289{
287 */ 290 struct bset_tree *t = b->set;
288 if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
289 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
290 SET_KEY_SIZE(l, USHRT_MAX);
291 291
292 bch_cut_front(l, r); 292 if (bset_prev_bytes(b) < PAGE_SIZE)
293 return false; 293 kfree(t->prev);
294 } 294 else
295 free_pages((unsigned long) t->prev,
296 get_order(bset_prev_bytes(b)));
295 297
296 if (KEY_CSUM(l)) { 298 if (bset_tree_bytes(b) < PAGE_SIZE)
297 if (KEY_CSUM(r)) 299 kfree(t->tree);
298 l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); 300 else
299 else 301 free_pages((unsigned long) t->tree,
300 SET_KEY_CSUM(l, 0); 302 get_order(bset_tree_bytes(b)));
301 }
302 303
303 SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); 304 free_pages((unsigned long) t->data, b->page_order);
304 SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
305 305
306 return true; 306 t->prev = NULL;
307 t->tree = NULL;
308 t->data = NULL;
309}
310EXPORT_SYMBOL(bch_btree_keys_free);
311
312int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
313{
314 struct bset_tree *t = b->set;
315
316 BUG_ON(t->data);
317
318 b->page_order = page_order;
319
320 t->data = (void *) __get_free_pages(gfp, b->page_order);
321 if (!t->data)
322 goto err;
323
324 t->tree = bset_tree_bytes(b) < PAGE_SIZE
325 ? kmalloc(bset_tree_bytes(b), gfp)
326 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
327 if (!t->tree)
328 goto err;
329
330 t->prev = bset_prev_bytes(b) < PAGE_SIZE
331 ? kmalloc(bset_prev_bytes(b), gfp)
332 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
333 if (!t->prev)
334 goto err;
335
336 return 0;
337err:
338 bch_btree_keys_free(b);
339 return -ENOMEM;
307} 340}
341EXPORT_SYMBOL(bch_btree_keys_alloc);
342
343void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
344 bool *expensive_debug_checks)
345{
346 unsigned i;
347
348 b->ops = ops;
349 b->expensive_debug_checks = expensive_debug_checks;
350 b->nsets = 0;
351 b->last_set_unwritten = 0;
352
353 /* XXX: shouldn't be needed */
354 for (i = 0; i < MAX_BSETS; i++)
355 b->set[i].size = 0;
356 /*
357 * Second loop starts at 1 because b->keys[0]->data is the memory we
358 * allocated
359 */
360 for (i = 1; i < MAX_BSETS; i++)
361 b->set[i].data = NULL;
362}
363EXPORT_SYMBOL(bch_btree_keys_init);
308 364
309/* Binary tree stuff for auxiliary search trees */ 365/* Binary tree stuff for auxiliary search trees */
310 366
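
The memory cost described in the BSET_CACHELINE comment above falls out of btree_keys_bytes(), bset_tree_bytes() and bset_prev_bytes(). Here is a back-of-the-envelope sketch, assuming a 4K page and an illustrative page_order (struct bkey_float packs to 4 bytes on common ABIs):

/* Rough sizing of the auxiliary search tree per btree node; the page size
 * and page_order below are illustrative assumptions. */
#include <stdio.h>

#define BSET_CACHELINE	128
#define PAGE_SIZE_BYTES	4096UL

int main(void)
{
	unsigned page_order = 2;				/* e.g. a 16K btree node */

	size_t keys_bytes = PAGE_SIZE_BYTES << page_order;	/* btree_keys_bytes()      */
	size_t cachelines = keys_bytes / BSET_CACHELINE;	/* btree_keys_cachelines() */
	size_t tree_bytes = cachelines * 4;			/* 4-byte packed bkey_float per cacheline */
	size_t prev_bytes = cachelines * 1;			/* one uint8_t per cacheline */

	printf("%zu-byte node: %zu cachelines, %zu tree bytes, %zu prev bytes\n",
	       keys_bytes, cachelines, tree_bytes, prev_bytes);
	return 0;
}
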
@@ -455,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
455 return ((void *) k - (void *) t->data) / BSET_CACHELINE; 511 return ((void *) k - (void *) t->data) / BSET_CACHELINE;
456} 512}
457 513
458static unsigned bkey_to_cacheline_offset(struct bkey *k) 514static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
515 unsigned cacheline,
516 struct bkey *k)
459{ 517{
460 return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t); 518 return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
461} 519}
462 520
463static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) 521static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
@@ -504,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
504 : tree_to_prev_bkey(t, j >> ffs(j)); 562 : tree_to_prev_bkey(t, j >> ffs(j));
505 563
506 struct bkey *r = is_power_of_2(j + 1) 564 struct bkey *r = is_power_of_2(j + 1)
507 ? node(t->data, t->data->keys - bkey_u64s(&t->end)) 565 ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
508 : tree_to_bkey(t, j >> (ffz(j) + 1)); 566 : tree_to_bkey(t, j >> (ffz(j) + 1));
509 567
510 BUG_ON(m < l || m > r); 568 BUG_ON(m < l || m > r);
@@ -528,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
528 f->exponent = 127; 586 f->exponent = 127;
529} 587}
530 588
531static void bset_alloc_tree(struct btree *b, struct bset_tree *t) 589static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
532{ 590{
533 if (t != b->sets) { 591 if (t != b->set) {
534 unsigned j = roundup(t[-1].size, 592 unsigned j = roundup(t[-1].size,
535 64 / sizeof(struct bkey_float)); 593 64 / sizeof(struct bkey_float));
536 594
@@ -538,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
538 t->prev = t[-1].prev + j; 596 t->prev = t[-1].prev + j;
539 } 597 }
540 598
541 while (t < b->sets + MAX_BSETS) 599 while (t < b->set + MAX_BSETS)
542 t++->size = 0; 600 t++->size = 0;
543} 601}
544 602
545static void bset_build_unwritten_tree(struct btree *b) 603static void bch_bset_build_unwritten_tree(struct btree_keys *b)
546{ 604{
547 struct bset_tree *t = b->sets + b->nsets; 605 struct bset_tree *t = bset_tree_last(b);
606
607 BUG_ON(b->last_set_unwritten);
608 b->last_set_unwritten = 1;
548 609
549 bset_alloc_tree(b, t); 610 bset_alloc_tree(b, t);
550 611
551 if (t->tree != b->sets->tree + bset_tree_space(b)) { 612 if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
552 t->prev[0] = bkey_to_cacheline_offset(t->data->start); 613 t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
553 t->size = 1; 614 t->size = 1;
554 } 615 }
555} 616}
556 617
557static void bset_build_written_tree(struct btree *b) 618void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
619{
620 if (i != b->set->data) {
621 b->set[++b->nsets].data = i;
622 i->seq = b->set->data->seq;
623 } else
624 get_random_bytes(&i->seq, sizeof(uint64_t));
625
626 i->magic = magic;
627 i->version = 0;
628 i->keys = 0;
629
630 bch_bset_build_unwritten_tree(b);
631}
632EXPORT_SYMBOL(bch_bset_init_next);
633
634void bch_bset_build_written_tree(struct btree_keys *b)
558{ 635{
559 struct bset_tree *t = b->sets + b->nsets; 636 struct bset_tree *t = bset_tree_last(b);
560 struct bkey *k = t->data->start; 637 struct bkey *prev = NULL, *k = t->data->start;
561 unsigned j, cacheline = 1; 638 unsigned j, cacheline = 1;
562 639
640 b->last_set_unwritten = 0;
641
563 bset_alloc_tree(b, t); 642 bset_alloc_tree(b, t);
564 643
565 t->size = min_t(unsigned, 644 t->size = min_t(unsigned,
566 bkey_to_cacheline(t, end(t->data)), 645 bkey_to_cacheline(t, bset_bkey_last(t->data)),
567 b->sets->tree + bset_tree_space(b) - t->tree); 646 b->set->tree + btree_keys_cachelines(b) - t->tree);
568 647
569 if (t->size < 2) { 648 if (t->size < 2) {
570 t->size = 0; 649 t->size = 0;
@@ -577,16 +656,14 @@ static void bset_build_written_tree(struct btree *b)
577 for (j = inorder_next(0, t->size); 656 for (j = inorder_next(0, t->size);
578 j; 657 j;
579 j = inorder_next(j, t->size)) { 658 j = inorder_next(j, t->size)) {
580 while (bkey_to_cacheline(t, k) != cacheline) 659 while (bkey_to_cacheline(t, k) < cacheline)
581 k = bkey_next(k); 660 prev = k, k = bkey_next(k);
582 661
583 t->prev[j] = bkey_u64s(k); 662 t->prev[j] = bkey_u64s(prev);
584 k = bkey_next(k); 663 t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
585 cacheline++;
586 t->tree[j].m = bkey_to_cacheline_offset(k);
587 } 664 }
588 665
589 while (bkey_next(k) != end(t->data)) 666 while (bkey_next(k) != bset_bkey_last(t->data))
590 k = bkey_next(k); 667 k = bkey_next(k);
591 668
592 t->end = *k; 669 t->end = *k;
@@ -597,14 +674,17 @@ static void bset_build_written_tree(struct btree *b)
597 j = inorder_next(j, t->size)) 674 j = inorder_next(j, t->size))
598 make_bfloat(t, j); 675 make_bfloat(t, j);
599} 676}
677EXPORT_SYMBOL(bch_bset_build_written_tree);
600 678
601void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) 679/* Insert */
680
681void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k)
602{ 682{
603 struct bset_tree *t; 683 struct bset_tree *t;
604 unsigned inorder, j = 1; 684 unsigned inorder, j = 1;
605 685
606 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 686 for (t = b->set; t <= bset_tree_last(b); t++)
607 if (k < end(t->data)) 687 if (k < bset_bkey_last(t->data))
608 goto found_set; 688 goto found_set;
609 689
610 BUG(); 690 BUG();
@@ -617,7 +697,7 @@ found_set:
617 if (k == t->data->start) 697 if (k == t->data->start)
618 goto fix_left; 698 goto fix_left;
619 699
620 if (bkey_next(k) == end(t->data)) { 700 if (bkey_next(k) == bset_bkey_last(t->data)) {
621 t->end = *k; 701 t->end = *k;
622 goto fix_right; 702 goto fix_right;
623 } 703 }
@@ -642,10 +722,12 @@ fix_right: do {
642 j = j * 2 + 1; 722 j = j * 2 + 1;
643 } while (j < t->size); 723 } while (j < t->size);
644} 724}
725EXPORT_SYMBOL(bch_bset_fix_invalidated_key);
645 726
646void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) 727static void bch_bset_fix_lookup_table(struct btree_keys *b,
728 struct bset_tree *t,
729 struct bkey *k)
647{ 730{
648 struct bset_tree *t = &b->sets[b->nsets];
649 unsigned shift = bkey_u64s(k); 731 unsigned shift = bkey_u64s(k);
650 unsigned j = bkey_to_cacheline(t, k); 732 unsigned j = bkey_to_cacheline(t, k);
651 733
@@ -657,8 +739,8 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
657 * lookup table for the first key that is strictly greater than k: 739 * lookup table for the first key that is strictly greater than k:
658 * it's either k's cacheline or the next one 740 * it's either k's cacheline or the next one
659 */ 741 */
660 if (j < t->size && 742 while (j < t->size &&
661 table_to_bkey(t, j) <= k) 743 table_to_bkey(t, j) <= k)
662 j++; 744 j++;
663 745
664 /* Adjust all the lookup table entries, and find a new key for any that 746 /* Adjust all the lookup table entries, and find a new key for any that
@@ -673,54 +755,124 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
673 while (k < cacheline_to_bkey(t, j, 0)) 755 while (k < cacheline_to_bkey(t, j, 0))
674 k = bkey_next(k); 756 k = bkey_next(k);
675 757
676 t->prev[j] = bkey_to_cacheline_offset(k); 758 t->prev[j] = bkey_to_cacheline_offset(t, j, k);
677 } 759 }
678 } 760 }
679 761
680 if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) 762 if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree)
681 return; 763 return;
682 764
683 /* Possibly add a new entry to the end of the lookup table */ 765 /* Possibly add a new entry to the end of the lookup table */
684 766
685 for (k = table_to_bkey(t, t->size - 1); 767 for (k = table_to_bkey(t, t->size - 1);
686 k != end(t->data); 768 k != bset_bkey_last(t->data);
687 k = bkey_next(k)) 769 k = bkey_next(k))
688 if (t->size == bkey_to_cacheline(t, k)) { 770 if (t->size == bkey_to_cacheline(t, k)) {
689 t->prev[t->size] = bkey_to_cacheline_offset(k); 771 t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k);
690 t->size++; 772 t->size++;
691 } 773 }
692} 774}
693 775
694void bch_bset_init_next(struct btree *b) 776/*
777 * Tries to merge l and r: l should be lower than r
778 * Returns true if we were able to merge. If we did merge, l will be the merged
779 * key, r will be untouched.
780 */
781bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
695{ 782{
696 struct bset *i = write_block(b); 783 if (!b->ops->key_merge)
784 return false;
697 785
698 if (i != b->sets[0].data) { 786 /*
699 b->sets[++b->nsets].data = i; 787 * Generic header checks
700 i->seq = b->sets[0].data->seq; 788 * Assumes left and right are in order
701 } else 789 * Left and right must be exactly aligned
702 get_random_bytes(&i->seq, sizeof(uint64_t)); 790 */
791 if (!bch_bkey_equal_header(l, r) ||
792 bkey_cmp(l, &START_KEY(r)))
793 return false;
703 794
704 i->magic = bset_magic(&b->c->sb); 795 return b->ops->key_merge(b, l, r);
705 i->version = 0; 796}
706 i->keys = 0; 797EXPORT_SYMBOL(bch_bkey_try_merge);
707 798
708 bset_build_unwritten_tree(b); 799void bch_bset_insert(struct btree_keys *b, struct bkey *where,
800 struct bkey *insert)
801{
802 struct bset_tree *t = bset_tree_last(b);
803
804 BUG_ON(!b->last_set_unwritten);
805 BUG_ON(bset_byte_offset(b, t->data) +
806 __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
807 PAGE_SIZE << b->page_order);
808
809 memmove((uint64_t *) where + bkey_u64s(insert),
810 where,
811 (void *) bset_bkey_last(t->data) - (void *) where);
812
813 t->data->keys += bkey_u64s(insert);
814 bkey_copy(where, insert);
815 bch_bset_fix_lookup_table(b, t, where);
709} 816}
817EXPORT_SYMBOL(bch_bset_insert);
818
819unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
820 struct bkey *replace_key)
821{
822 unsigned status = BTREE_INSERT_STATUS_NO_INSERT;
823 struct bset *i = bset_tree_last(b)->data;
824 struct bkey *m, *prev = NULL;
825 struct btree_iter iter;
826
827 BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
828
829 m = bch_btree_iter_init(b, &iter, b->ops->is_extents
830 ? PRECEDING_KEY(&START_KEY(k))
831 : PRECEDING_KEY(k));
832
833 if (b->ops->insert_fixup(b, k, &iter, replace_key))
834 return status;
835
836 status = BTREE_INSERT_STATUS_INSERT;
837
838 while (m != bset_bkey_last(i) &&
839 bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
840 prev = m, m = bkey_next(m);
841
842 /* prev is in the tree, if we merge we're done */
843 status = BTREE_INSERT_STATUS_BACK_MERGE;
844 if (prev &&
845 bch_bkey_try_merge(b, prev, k))
846 goto merged;
847#if 0
848 status = BTREE_INSERT_STATUS_OVERWROTE;
849 if (m != bset_bkey_last(i) &&
850 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
851 goto copy;
852#endif
853 status = BTREE_INSERT_STATUS_FRONT_MERGE;
854 if (m != bset_bkey_last(i) &&
855 bch_bkey_try_merge(b, k, m))
856 goto copy;
857
858 bch_bset_insert(b, m, k);
859copy: bkey_copy(m, k);
860merged:
861 return status;
862}
863EXPORT_SYMBOL(bch_btree_insert_key);
864
865/* Lookup */
710 866
711struct bset_search_iter { 867struct bset_search_iter {
712 struct bkey *l, *r; 868 struct bkey *l, *r;
713}; 869};
714 870
715static struct bset_search_iter bset_search_write_set(struct btree *b, 871static struct bset_search_iter bset_search_write_set(struct bset_tree *t,
716 struct bset_tree *t,
717 const struct bkey *search) 872 const struct bkey *search)
718{ 873{
719 unsigned li = 0, ri = t->size; 874 unsigned li = 0, ri = t->size;
720 875
721 BUG_ON(!b->nsets &&
722 t->size < bkey_to_cacheline(t, end(t->data)));
723
724 while (li + 1 != ri) { 876 while (li + 1 != ri) {
725 unsigned m = (li + ri) >> 1; 877 unsigned m = (li + ri) >> 1;
726 878
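
The refactored bch_bkey_try_merge() earlier in this hunk does only the generic header and adjacency checks, then defers to the per-btree key_merge operation (extents provide one, btree-node keys do not). A hypothetical sketch of that dispatch follows; the structs and the header-equality check are stand-ins rather than the exact bset.h definitions.

/* Hypothetical sketch of the ops-based merge dispatch; the structs and the
 * header-equality check are stand-ins, not copied from bset.h. */
#include <stdbool.h>

struct bkey { unsigned ptrs, csum, dirty; unsigned long long offset, size; };

struct btree_keys;

struct btree_keys_ops {
	/* NULL for btree-node keys, which are never merged */
	bool (*key_merge)(struct btree_keys *, struct bkey *, struct bkey *);
};

struct btree_keys { const struct btree_keys_ops *ops; };

static bool bkey_equal_header(const struct bkey *l, const struct bkey *r)
{
	/* same pointer count and the same csum/dirty flags */
	return l->ptrs == r->ptrs && l->csum == r->csum && l->dirty == r->dirty;
}

static bool bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r)
{
	if (!b->ops->key_merge)
		return false;

	/* generic checks: matching headers, and l must end exactly where r starts */
	if (!bkey_equal_header(l, r) || l->offset != r->offset - r->size)
		return false;

	/* format-specific pointer/size checks live in ops->key_merge() */
	return b->ops->key_merge(b, l, r);
}

int main(void)
{
	(void) bkey_try_merge;
	return 0;
}
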
@@ -732,12 +884,11 @@ static struct bset_search_iter bset_search_write_set(struct btree *b,
732 884
733 return (struct bset_search_iter) { 885 return (struct bset_search_iter) {
734 table_to_bkey(t, li), 886 table_to_bkey(t, li),
735 ri < t->size ? table_to_bkey(t, ri) : end(t->data) 887 ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data)
736 }; 888 };
737} 889}
738 890
739static struct bset_search_iter bset_search_tree(struct btree *b, 891static struct bset_search_iter bset_search_tree(struct bset_tree *t,
740 struct bset_tree *t,
741 const struct bkey *search) 892 const struct bkey *search)
742{ 893{
743 struct bkey *l, *r; 894 struct bkey *l, *r;
@@ -784,7 +935,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
784 f = &t->tree[inorder_next(j, t->size)]; 935 f = &t->tree[inorder_next(j, t->size)];
785 r = cacheline_to_bkey(t, inorder, f->m); 936 r = cacheline_to_bkey(t, inorder, f->m);
786 } else 937 } else
787 r = end(t->data); 938 r = bset_bkey_last(t->data);
788 } else { 939 } else {
789 r = cacheline_to_bkey(t, inorder, f->m); 940 r = cacheline_to_bkey(t, inorder, f->m);
790 941
@@ -798,7 +949,7 @@ static struct bset_search_iter bset_search_tree(struct btree *b,
798 return (struct bset_search_iter) {l, r}; 949 return (struct bset_search_iter) {l, r};
799} 950}
800 951
801struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, 952struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
802 const struct bkey *search) 953 const struct bkey *search)
803{ 954{
804 struct bset_search_iter i; 955 struct bset_search_iter i;
@@ -820,7 +971,7 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
820 971
821 if (unlikely(!t->size)) { 972 if (unlikely(!t->size)) {
822 i.l = t->data->start; 973 i.l = t->data->start;
823 i.r = end(t->data); 974 i.r = bset_bkey_last(t->data);
824 } else if (bset_written(b, t)) { 975 } else if (bset_written(b, t)) {
825 /* 976 /*
826 * Each node in the auxiliary search tree covers a certain range 977 * Each node in the auxiliary search tree covers a certain range
@@ -830,23 +981,27 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
830 */ 981 */
831 982
832 if (unlikely(bkey_cmp(search, &t->end) >= 0)) 983 if (unlikely(bkey_cmp(search, &t->end) >= 0))
833 return end(t->data); 984 return bset_bkey_last(t->data);
834 985
835 if (unlikely(bkey_cmp(search, t->data->start) < 0)) 986 if (unlikely(bkey_cmp(search, t->data->start) < 0))
836 return t->data->start; 987 return t->data->start;
837 988
838 i = bset_search_tree(b, t, search); 989 i = bset_search_tree(t, search);
839 } else 990 } else {
840 i = bset_search_write_set(b, t, search); 991 BUG_ON(!b->nsets &&
992 t->size < bkey_to_cacheline(t, bset_bkey_last(t->data)));
841 993
842 if (expensive_debug_checks(b->c)) { 994 i = bset_search_write_set(t, search);
995 }
996
997 if (btree_keys_expensive_checks(b)) {
843 BUG_ON(bset_written(b, t) && 998 BUG_ON(bset_written(b, t) &&
844 i.l != t->data->start && 999 i.l != t->data->start &&
845 bkey_cmp(tree_to_prev_bkey(t, 1000 bkey_cmp(tree_to_prev_bkey(t,
846 inorder_to_tree(bkey_to_cacheline(t, i.l), t)), 1001 inorder_to_tree(bkey_to_cacheline(t, i.l), t)),
847 search) > 0); 1002 search) > 0);
848 1003
849 BUG_ON(i.r != end(t->data) && 1004 BUG_ON(i.r != bset_bkey_last(t->data) &&
850 bkey_cmp(i.r, search) <= 0); 1005 bkey_cmp(i.r, search) <= 0);
851 } 1006 }
852 1007
@@ -856,22 +1011,17 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t,
856 1011
857 return i.l; 1012 return i.l;
858} 1013}
1014EXPORT_SYMBOL(__bch_bset_search);
859 1015
860/* Btree iterator */ 1016/* Btree iterator */
861 1017
862/* 1018typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
863 * Returns true if l > r - unless l == r, in which case returns true if l is 1019 struct btree_iter_set);
864 * older than r. 1020
865 *
866 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
867 * equal in different sets, we have to process them newest to oldest.
868 */
869static inline bool btree_iter_cmp(struct btree_iter_set l, 1021static inline bool btree_iter_cmp(struct btree_iter_set l,
870 struct btree_iter_set r) 1022 struct btree_iter_set r)
871{ 1023{
872 int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); 1024 return bkey_cmp(l.k, r.k) > 0;
873
874 return c ? c > 0 : l.k < r.k;
875} 1025}
876 1026
877static inline bool btree_iter_end(struct btree_iter *iter) 1027static inline bool btree_iter_end(struct btree_iter *iter)
@@ -888,8 +1038,10 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
888 btree_iter_cmp)); 1038 btree_iter_cmp));
889} 1039}
890 1040
891struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, 1041static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
892 struct bkey *search, struct bset_tree *start) 1042 struct btree_iter *iter,
1043 struct bkey *search,
1044 struct bset_tree *start)
893{ 1045{
894 struct bkey *ret = NULL; 1046 struct bkey *ret = NULL;
895 iter->size = ARRAY_SIZE(iter->data); 1047 iter->size = ARRAY_SIZE(iter->data);
@@ -899,15 +1051,24 @@ struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter,
899 iter->b = b; 1051 iter->b = b;
900#endif 1052#endif
901 1053
902 for (; start <= &b->sets[b->nsets]; start++) { 1054 for (; start <= bset_tree_last(b); start++) {
903 ret = bch_bset_search(b, start, search); 1055 ret = bch_bset_search(b, start, search);
904 bch_btree_iter_push(iter, ret, end(start->data)); 1056 bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
905 } 1057 }
906 1058
907 return ret; 1059 return ret;
908} 1060}
909 1061
910struct bkey *bch_btree_iter_next(struct btree_iter *iter) 1062struct bkey *bch_btree_iter_init(struct btree_keys *b,
1063 struct btree_iter *iter,
1064 struct bkey *search)
1065{
1066 return __bch_btree_iter_init(b, iter, search, b->set);
1067}
1068EXPORT_SYMBOL(bch_btree_iter_init);
1069
1070static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
1071 btree_iter_cmp_fn *cmp)
911{ 1072{
912 struct btree_iter_set unused; 1073 struct btree_iter_set unused;
913 struct bkey *ret = NULL; 1074 struct bkey *ret = NULL;
@@ -924,16 +1085,23 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter)
924 } 1085 }
925 1086
926 if (iter->data->k == iter->data->end) 1087 if (iter->data->k == iter->data->end)
927 heap_pop(iter, unused, btree_iter_cmp); 1088 heap_pop(iter, unused, cmp);
928 else 1089 else
929 heap_sift(iter, 0, btree_iter_cmp); 1090 heap_sift(iter, 0, cmp);
930 } 1091 }
931 1092
932 return ret; 1093 return ret;
933} 1094}
934 1095
1096struct bkey *bch_btree_iter_next(struct btree_iter *iter)
1097{
1098 return __bch_btree_iter_next(iter, btree_iter_cmp);
1099
1100}
1101EXPORT_SYMBOL(bch_btree_iter_next);
1102
935struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, 1103struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
936 struct btree *b, ptr_filter_fn fn) 1104 struct btree_keys *b, ptr_filter_fn fn)
937{ 1105{
938 struct bkey *ret; 1106 struct bkey *ret;
939 1107
@@ -946,70 +1114,58 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
946 1114
947/* Mergesort */ 1115/* Mergesort */
948 1116
949static void sort_key_next(struct btree_iter *iter, 1117void bch_bset_sort_state_free(struct bset_sort_state *state)
950 struct btree_iter_set *i)
951{ 1118{
952 i->k = bkey_next(i->k); 1119 if (state->pool)
953 1120 mempool_destroy(state->pool);
954 if (i->k == i->end)
955 *i = iter->data[--iter->used];
956} 1121}
957 1122
958static void btree_sort_fixup(struct btree_iter *iter) 1123int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
959{ 1124{
960 while (iter->used > 1) { 1125 spin_lock_init(&state->time.lock);
961 struct btree_iter_set *top = iter->data, *i = top + 1;
962 1126
963 if (iter->used > 2 && 1127 state->page_order = page_order;
964 btree_iter_cmp(i[0], i[1])) 1128 state->crit_factor = int_sqrt(1 << page_order);
965 i++;
966 1129
967 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) 1130 state->pool = mempool_create_page_pool(1, page_order);
968 break; 1131 if (!state->pool)
969 1132 return -ENOMEM;
970 if (!KEY_SIZE(i->k)) {
971 sort_key_next(iter, i);
972 heap_sift(iter, i - top, btree_iter_cmp);
973 continue;
974 }
975
976 if (top->k > i->k) {
977 if (bkey_cmp(top->k, i->k) >= 0)
978 sort_key_next(iter, i);
979 else
980 bch_cut_front(top->k, i->k);
981 1133
982 heap_sift(iter, i - top, btree_iter_cmp); 1134 return 0;
983 } else {
984 /* can't happen because of comparison func */
985 BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
986 bch_cut_back(&START_KEY(i->k), top->k);
987 }
988 }
989} 1135}
1136EXPORT_SYMBOL(bch_bset_sort_state_init);
990 1137
991static void btree_mergesort(struct btree *b, struct bset *out, 1138static void btree_mergesort(struct btree_keys *b, struct bset *out,
992 struct btree_iter *iter, 1139 struct btree_iter *iter,
993 bool fixup, bool remove_stale) 1140 bool fixup, bool remove_stale)
994{ 1141{
1142 int i;
995 struct bkey *k, *last = NULL; 1143 struct bkey *k, *last = NULL;
996 bool (*bad)(struct btree *, const struct bkey *) = remove_stale 1144 BKEY_PADDED(k) tmp;
1145 bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
997 ? bch_ptr_bad 1146 ? bch_ptr_bad
998 : bch_ptr_invalid; 1147 : bch_ptr_invalid;
999 1148
1149 /* Heapify the iterator, using our comparison function */
1150 for (i = iter->used / 2 - 1; i >= 0; --i)
1151 heap_sift(iter, i, b->ops->sort_cmp);
1152
1000 while (!btree_iter_end(iter)) { 1153 while (!btree_iter_end(iter)) {
1001 if (fixup && !b->level) 1154 if (b->ops->sort_fixup && fixup)
1002 btree_sort_fixup(iter); 1155 k = b->ops->sort_fixup(iter, &tmp.k);
1156 else
1157 k = NULL;
1158
1159 if (!k)
1160 k = __bch_btree_iter_next(iter, b->ops->sort_cmp);
1003 1161
1004 k = bch_btree_iter_next(iter);
1005 if (bad(b, k)) 1162 if (bad(b, k))
1006 continue; 1163 continue;
1007 1164
1008 if (!last) { 1165 if (!last) {
1009 last = out->start; 1166 last = out->start;
1010 bkey_copy(last, k); 1167 bkey_copy(last, k);
1011 } else if (b->level || 1168 } else if (!bch_bkey_try_merge(b, last, k)) {
1012 !bch_bkey_try_merge(b, last, k)) {
1013 last = bkey_next(last); 1169 last = bkey_next(last);
1014 bkey_copy(last, k); 1170 bkey_copy(last, k);
1015 } 1171 }
@@ -1020,27 +1176,27 @@ static void btree_mergesort(struct btree *b, struct bset *out,
1020 pr_debug("sorted %i keys", out->keys); 1176 pr_debug("sorted %i keys", out->keys);
1021} 1177}
1022 1178
1023static void __btree_sort(struct btree *b, struct btree_iter *iter, 1179static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
1024 unsigned start, unsigned order, bool fixup) 1180 unsigned start, unsigned order, bool fixup,
1181 struct bset_sort_state *state)
1025{ 1182{
1026 uint64_t start_time; 1183 uint64_t start_time;
1027 bool remove_stale = !b->written; 1184 bool used_mempool = false;
1028 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, 1185 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
1029 order); 1186 order);
1030 if (!out) { 1187 if (!out) {
1031 mutex_lock(&b->c->sort_lock); 1188 BUG_ON(order > state->page_order);
1032 out = b->c->sort; 1189
1033 order = ilog2(bucket_pages(b->c)); 1190 out = page_address(mempool_alloc(state->pool, GFP_NOIO));
1191 used_mempool = true;
1192 order = state->page_order;
1034 } 1193 }
1035 1194
1036 start_time = local_clock(); 1195 start_time = local_clock();
1037 1196
1038 btree_mergesort(b, out, iter, fixup, remove_stale); 1197 btree_mergesort(b, out, iter, fixup, false);
1039 b->nsets = start; 1198 b->nsets = start;
1040 1199
1041 if (!fixup && !start && b->written)
1042 bch_btree_verify(b, out);
1043
1044 if (!start && order == b->page_order) { 1200 if (!start && order == b->page_order) {
1045 /* 1201 /*
1046 * Our temporary buffer is the same size as the btree node's 1202 * Our temporary buffer is the same size as the btree node's
@@ -1048,84 +1204,76 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
1048 * memcpy() 1204 * memcpy()
1049 */ 1205 */
1050 1206
1051 out->magic = bset_magic(&b->c->sb); 1207 out->magic = b->set->data->magic;
1052 out->seq = b->sets[0].data->seq; 1208 out->seq = b->set->data->seq;
1053 out->version = b->sets[0].data->version; 1209 out->version = b->set->data->version;
1054 swap(out, b->sets[0].data); 1210 swap(out, b->set->data);
1055
1056 if (b->c->sort == b->sets[0].data)
1057 b->c->sort = out;
1058 } else { 1211 } else {
1059 b->sets[start].data->keys = out->keys; 1212 b->set[start].data->keys = out->keys;
1060 memcpy(b->sets[start].data->start, out->start, 1213 memcpy(b->set[start].data->start, out->start,
1061 (void *) end(out) - (void *) out->start); 1214 (void *) bset_bkey_last(out) - (void *) out->start);
1062 } 1215 }
1063 1216
1064 if (out == b->c->sort) 1217 if (used_mempool)
1065 mutex_unlock(&b->c->sort_lock); 1218 mempool_free(virt_to_page(out), state->pool);
1066 else 1219 else
1067 free_pages((unsigned long) out, order); 1220 free_pages((unsigned long) out, order);
1068 1221
1069 if (b->written) 1222 bch_bset_build_written_tree(b);
1070 bset_build_written_tree(b);
1071 1223
1072 if (!start) 1224 if (!start)
1073 bch_time_stats_update(&b->c->sort_time, start_time); 1225 bch_time_stats_update(&state->time, start_time);
1074} 1226}
1075 1227
1076void bch_btree_sort_partial(struct btree *b, unsigned start) 1228void bch_btree_sort_partial(struct btree_keys *b, unsigned start,
1229 struct bset_sort_state *state)
1077{ 1230{
1078 size_t order = b->page_order, keys = 0; 1231 size_t order = b->page_order, keys = 0;
1079 struct btree_iter iter; 1232 struct btree_iter iter;
1080 int oldsize = bch_count_data(b); 1233 int oldsize = bch_count_data(b);
1081 1234
1082 __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); 1235 __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
1083
1084 BUG_ON(b->sets[b->nsets].data == write_block(b) &&
1085 (b->sets[b->nsets].size || b->nsets));
1086
1087 1236
1088 if (start) { 1237 if (start) {
1089 unsigned i; 1238 unsigned i;
1090 1239
1091 for (i = start; i <= b->nsets; i++) 1240 for (i = start; i <= b->nsets; i++)
1092 keys += b->sets[i].data->keys; 1241 keys += b->set[i].data->keys;
1093 1242
1094 order = roundup_pow_of_two(__set_bytes(b->sets->data, 1243 order = get_order(__set_bytes(b->set->data, keys));
1095 keys)) / PAGE_SIZE;
1096 if (order)
1097 order = ilog2(order);
1098 } 1244 }
1099 1245
1100 __btree_sort(b, &iter, start, order, false); 1246 __btree_sort(b, &iter, start, order, false, state);
1101 1247
1102 EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); 1248 EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
1103} 1249}
1250EXPORT_SYMBOL(bch_btree_sort_partial);
1104 1251
1105void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) 1252void bch_btree_sort_and_fix_extents(struct btree_keys *b,
1253 struct btree_iter *iter,
1254 struct bset_sort_state *state)
1106{ 1255{
1107 BUG_ON(!b->written); 1256 __btree_sort(b, iter, 0, b->page_order, true, state);
1108 __btree_sort(b, iter, 0, b->page_order, true);
1109} 1257}
1110 1258
1111void bch_btree_sort_into(struct btree *b, struct btree *new) 1259void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
1260 struct bset_sort_state *state)
1112{ 1261{
1113 uint64_t start_time = local_clock(); 1262 uint64_t start_time = local_clock();
1114 1263
1115 struct btree_iter iter; 1264 struct btree_iter iter;
1116 bch_btree_iter_init(b, &iter, NULL); 1265 bch_btree_iter_init(b, &iter, NULL);
1117 1266
1118 btree_mergesort(b, new->sets->data, &iter, false, true); 1267 btree_mergesort(b, new->set->data, &iter, false, true);
1119 1268
1120 bch_time_stats_update(&b->c->sort_time, start_time); 1269 bch_time_stats_update(&state->time, start_time);
1121 1270
1122 bkey_copy_key(&new->key, &b->key); 1271 new->set->size = 0; // XXX: why?
1123 new->sets->size = 0;
1124} 1272}
1125 1273
1126#define SORT_CRIT (4096 / sizeof(uint64_t)) 1274#define SORT_CRIT (4096 / sizeof(uint64_t))
1127 1275
1128void bch_btree_sort_lazy(struct btree *b) 1276void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state)
1129{ 1277{
1130 unsigned crit = SORT_CRIT; 1278 unsigned crit = SORT_CRIT;
1131 int i; 1279 int i;
@@ -1134,50 +1282,32 @@ void bch_btree_sort_lazy(struct btree *b)
1134 if (!b->nsets) 1282 if (!b->nsets)
1135 goto out; 1283 goto out;
1136 1284
1137 /* If not a leaf node, always sort */
1138 if (b->level) {
1139 bch_btree_sort(b);
1140 return;
1141 }
1142
1143 for (i = b->nsets - 1; i >= 0; --i) { 1285 for (i = b->nsets - 1; i >= 0; --i) {
1144 crit *= b->c->sort_crit_factor; 1286 crit *= state->crit_factor;
1145 1287
1146 if (b->sets[i].data->keys < crit) { 1288 if (b->set[i].data->keys < crit) {
1147 bch_btree_sort_partial(b, i); 1289 bch_btree_sort_partial(b, i, state);
1148 return; 1290 return;
1149 } 1291 }
1150 } 1292 }
1151 1293
1152 /* Sort if we'd overflow */ 1294 /* Sort if we'd overflow */
1153 if (b->nsets + 1 == MAX_BSETS) { 1295 if (b->nsets + 1 == MAX_BSETS) {
1154 bch_btree_sort(b); 1296 bch_btree_sort(b, state);
1155 return; 1297 return;
1156 } 1298 }
1157 1299
1158out: 1300out:
1159 bset_build_written_tree(b); 1301 bch_bset_build_written_tree(b);
1160} 1302}
1303EXPORT_SYMBOL(bch_btree_sort_lazy);
1161 1304
1162/* Sysfs stuff */ 1305void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats)
1163
1164struct bset_stats {
1165 struct btree_op op;
1166 size_t nodes;
1167 size_t sets_written, sets_unwritten;
1168 size_t bytes_written, bytes_unwritten;
1169 size_t floats, failed;
1170};
1171
1172static int btree_bset_stats(struct btree_op *op, struct btree *b)
1173{ 1306{
1174 struct bset_stats *stats = container_of(op, struct bset_stats, op);
1175 unsigned i; 1307 unsigned i;
1176 1308
1177 stats->nodes++;
1178
1179 for (i = 0; i <= b->nsets; i++) { 1309 for (i = 0; i <= b->nsets; i++) {
1180 struct bset_tree *t = &b->sets[i]; 1310 struct bset_tree *t = &b->set[i];
1181 size_t bytes = t->data->keys * sizeof(uint64_t); 1311 size_t bytes = t->data->keys * sizeof(uint64_t);
1182 size_t j; 1312 size_t j;
1183 1313
@@ -1195,32 +1325,4 @@ static int btree_bset_stats(struct btree_op *op, struct btree *b)
1195 stats->bytes_unwritten += bytes; 1325 stats->bytes_unwritten += bytes;
1196 } 1326 }
1197 } 1327 }
1198
1199 return MAP_CONTINUE;
1200}
1201
1202int bch_bset_print_stats(struct cache_set *c, char *buf)
1203{
1204 struct bset_stats t;
1205 int ret;
1206
1207 memset(&t, 0, sizeof(struct bset_stats));
1208 bch_btree_op_init(&t.op, -1);
1209
1210 ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats);
1211 if (ret < 0)
1212 return ret;
1213
1214 return snprintf(buf, PAGE_SIZE,
1215 "btree nodes: %zu\n"
1216 "written sets: %zu\n"
1217 "unwritten sets: %zu\n"
1218 "written key bytes: %zu\n"
1219 "unwritten key bytes: %zu\n"
1220 "floats: %zu\n"
1221 "failed: %zu\n",
1222 t.nodes,
1223 t.sets_written, t.sets_unwritten,
1224 t.bytes_written, t.bytes_unwritten,
1225 t.floats, t.failed);
1226} 1328}
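
Editor's aside: the bset.c hunks above replace the old cache_set-wide sort buffer (c->sort guarded by sort_lock) with a per-call allocation that falls back to a mempool owned by struct bset_sort_state. A minimal sketch of that allocate-or-fall-back pattern, using only the kernel calls already visible in the hunk; the helper names sort_buf_get() and sort_buf_put() are illustrative and not part of the patch:

static struct bset *sort_buf_get(struct bset_sort_state *state,
                                 unsigned *order, bool *used_mempool)
{
        /* Opportunistic allocation; may fail under memory pressure. */
        struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
                                                     *order);
        if (out) {
                *used_mempool = false;
                return out;
        }

        /* Guaranteed fallback: the preallocated page_order mempool. */
        BUG_ON(*order > state->page_order);
        *order = state->page_order;
        *used_mempool = true;
        return page_address(mempool_alloc(state->pool, GFP_NOIO));
}

static void sort_buf_put(struct bset_sort_state *state, struct bset *out,
                         unsigned order, bool used_mempool)
{
        if (used_mempool)
                mempool_free(virt_to_page(out), state->pool);
        else
                free_pages((unsigned long) out, order);
}

Because the mempool is sized for a full node (state->page_order), __btree_sort() can always make forward progress even when the opportunistic allocation fails.
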
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 1d3c24f9fa0e..003260f4ddf6 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -1,7 +1,11 @@
1#ifndef _BCACHE_BSET_H 1#ifndef _BCACHE_BSET_H
2#define _BCACHE_BSET_H 2#define _BCACHE_BSET_H
3 3
4#include <linux/slab.h> 4#include <linux/bcache.h>
5#include <linux/kernel.h>
6#include <linux/types.h>
7
8#include "util.h" /* for time_stats */
5 9
6/* 10/*
7 * BKEYS: 11 * BKEYS:
@@ -142,20 +146,13 @@
142 * first key in that range of bytes again. 146 * first key in that range of bytes again.
143 */ 147 */
144 148
145/* Btree key comparison/iteration */ 149struct btree_keys;
150struct btree_iter;
151struct btree_iter_set;
152struct bkey_float;
146 153
147#define MAX_BSETS 4U 154#define MAX_BSETS 4U
148 155
149struct btree_iter {
150 size_t size, used;
151#ifdef CONFIG_BCACHE_DEBUG
152 struct btree *b;
153#endif
154 struct btree_iter_set {
155 struct bkey *k, *end;
156 } data[MAX_BSETS];
157};
158
159struct bset_tree { 156struct bset_tree {
160 /* 157 /*
161 * We construct a binary tree in an array as if the array 158 * We construct a binary tree in an array as if the array
@@ -165,14 +162,14 @@ struct bset_tree {
165 */ 162 */
166 163
167 /* size of the binary tree and prev array */ 164 /* size of the binary tree and prev array */
168 unsigned size; 165 unsigned size;
169 166
170 /* function of size - precalculated for to_inorder() */ 167 /* function of size - precalculated for to_inorder() */
171 unsigned extra; 168 unsigned extra;
172 169
173 /* copy of the last key in the set */ 170 /* copy of the last key in the set */
174 struct bkey end; 171 struct bkey end;
175 struct bkey_float *tree; 172 struct bkey_float *tree;
176 173
177 /* 174 /*
178 * The nodes in the bset tree point to specific keys - this 175 * The nodes in the bset tree point to specific keys - this
@@ -182,12 +179,219 @@ struct bset_tree {
182 * to keep bkey_float to 4 bytes and prev isn't used in the fast 179 * to keep bkey_float to 4 bytes and prev isn't used in the fast
183 * path. 180 * path.
184 */ 181 */
185 uint8_t *prev; 182 uint8_t *prev;
186 183
187 /* The actual btree node, with pointers to each sorted set */ 184 /* The actual btree node, with pointers to each sorted set */
188 struct bset *data; 185 struct bset *data;
186};
187
188struct btree_keys_ops {
189 bool (*sort_cmp)(struct btree_iter_set,
190 struct btree_iter_set);
191 struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *);
192 bool (*insert_fixup)(struct btree_keys *, struct bkey *,
193 struct btree_iter *, struct bkey *);
194 bool (*key_invalid)(struct btree_keys *,
195 const struct bkey *);
196 bool (*key_bad)(struct btree_keys *, const struct bkey *);
197 bool (*key_merge)(struct btree_keys *,
198 struct bkey *, struct bkey *);
199 void (*key_to_text)(char *, size_t, const struct bkey *);
200 void (*key_dump)(struct btree_keys *, const struct bkey *);
201
202 /*
203 * Only used for deciding whether to use START_KEY(k) or just the key
204 * itself in a couple places
205 */
206 bool is_extents;
207};
208
209struct btree_keys {
210 const struct btree_keys_ops *ops;
211 uint8_t page_order;
212 uint8_t nsets;
213 unsigned last_set_unwritten:1;
214 bool *expensive_debug_checks;
215
216 /*
217 * Sets of sorted keys - the real btree node - plus a binary search tree
218 *
219 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
220 * to the memory we have allocated for this btree node. Additionally,
221 * set[0]->data points to the entire btree node as it exists on disk.
222 */
223 struct bset_tree set[MAX_BSETS];
224};
225
226static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
227{
228 return b->set + b->nsets;
229}
230
231static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
232{
233 return t <= b->set + b->nsets - b->last_set_unwritten;
234}
235
236static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
237{
238 return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
239}
240
241static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i)
242{
243 return ((size_t) i) - ((size_t) b->set->data);
244}
245
246static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i)
247{
248 return bset_byte_offset(b, i) >> 9;
249}
250
251#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
252#define set_bytes(i) __set_bytes(i, i->keys)
253
254#define __set_blocks(i, k, block_bytes) \
255 DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
256#define set_blocks(i, block_bytes) \
257 __set_blocks(i, (i)->keys, block_bytes)
258
259static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
260{
261 struct bset_tree *t = bset_tree_last(b);
262
263 BUG_ON((PAGE_SIZE << b->page_order) <
264 (bset_byte_offset(b, t->data) + set_bytes(t->data)));
265
266 if (!b->last_set_unwritten)
267 return 0;
268
269 return ((PAGE_SIZE << b->page_order) -
270 (bset_byte_offset(b, t->data) + set_bytes(t->data))) /
271 sizeof(u64);
272}
273
274static inline struct bset *bset_next_set(struct btree_keys *b,
275 unsigned block_bytes)
276{
277 struct bset *i = bset_tree_last(b)->data;
278
279 return ((void *) i) + roundup(set_bytes(i), block_bytes);
280}
281
282void bch_btree_keys_free(struct btree_keys *);
283int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t);
284void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *,
285 bool *);
286
287void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t);
288void bch_bset_build_written_tree(struct btree_keys *);
289void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *);
290bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *);
291void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *);
292unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *,
293 struct bkey *);
294
295enum {
296 BTREE_INSERT_STATUS_NO_INSERT = 0,
297 BTREE_INSERT_STATUS_INSERT,
298 BTREE_INSERT_STATUS_BACK_MERGE,
299 BTREE_INSERT_STATUS_OVERWROTE,
300 BTREE_INSERT_STATUS_FRONT_MERGE,
189}; 301};
190 302
303/* Btree key iteration */
304
305struct btree_iter {
306 size_t size, used;
307#ifdef CONFIG_BCACHE_DEBUG
308 struct btree_keys *b;
309#endif
310 struct btree_iter_set {
311 struct bkey *k, *end;
312 } data[MAX_BSETS];
313};
314
315typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *);
316
317struct bkey *bch_btree_iter_next(struct btree_iter *);
318struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
319 struct btree_keys *, ptr_filter_fn);
320
321void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
322struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *,
323 struct bkey *);
324
325struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *,
326 const struct bkey *);
327
328/*
329 * Returns the first key that is strictly greater than search
330 */
331static inline struct bkey *bch_bset_search(struct btree_keys *b,
332 struct bset_tree *t,
333 const struct bkey *search)
334{
335 return search ? __bch_bset_search(b, t, search) : t->data->start;
336}
337
338#define for_each_key_filter(b, k, iter, filter) \
339 for (bch_btree_iter_init((b), (iter), NULL); \
340 ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
341
342#define for_each_key(b, k, iter) \
343 for (bch_btree_iter_init((b), (iter), NULL); \
344 ((k) = bch_btree_iter_next(iter));)
345
346/* Sorting */
347
348struct bset_sort_state {
349 mempool_t *pool;
350
351 unsigned page_order;
352 unsigned crit_factor;
353
354 struct time_stats time;
355};
356
357void bch_bset_sort_state_free(struct bset_sort_state *);
358int bch_bset_sort_state_init(struct bset_sort_state *, unsigned);
359void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *);
360void bch_btree_sort_into(struct btree_keys *, struct btree_keys *,
361 struct bset_sort_state *);
362void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *,
363 struct bset_sort_state *);
364void bch_btree_sort_partial(struct btree_keys *, unsigned,
365 struct bset_sort_state *);
366
367static inline void bch_btree_sort(struct btree_keys *b,
368 struct bset_sort_state *state)
369{
370 bch_btree_sort_partial(b, 0, state);
371}
372
373struct bset_stats {
374 size_t sets_written, sets_unwritten;
375 size_t bytes_written, bytes_unwritten;
376 size_t floats, failed;
377};
378
379void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *);
380
381/* Bkey utility code */
382
383#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys)
384
385static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
386{
387 return bkey_idx(i->start, idx);
388}
389
390static inline void bkey_init(struct bkey *k)
391{
392 *k = ZERO_KEY;
393}
394
191static __always_inline int64_t bkey_cmp(const struct bkey *l, 395static __always_inline int64_t bkey_cmp(const struct bkey *l,
192 const struct bkey *r) 396 const struct bkey *r)
193{ 397{
@@ -196,6 +400,62 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
196 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); 400 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
197} 401}
198 402
403void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
404 unsigned);
405bool __bch_cut_front(const struct bkey *, struct bkey *);
406bool __bch_cut_back(const struct bkey *, struct bkey *);
407
408static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
409{
410 BUG_ON(bkey_cmp(where, k) > 0);
411 return __bch_cut_front(where, k);
412}
413
414static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
415{
416 BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
417 return __bch_cut_back(where, k);
418}
419
420#define PRECEDING_KEY(_k) \
421({ \
422 struct bkey *_ret = NULL; \
423 \
424 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
425 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
426 \
427 if (!_ret->low) \
428 _ret->high--; \
429 _ret->low--; \
430 } \
431 \
432 _ret; \
433})
434
435static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
436{
437 return b->ops->key_invalid(b, k);
438}
439
440static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
441{
442 return b->ops->key_bad(b, k);
443}
444
445static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
446 size_t size, const struct bkey *k)
447{
448 return b->ops->key_to_text(buf, size, k);
449}
450
451static inline bool bch_bkey_equal_header(const struct bkey *l,
452 const struct bkey *r)
453{
454 return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
455 KEY_PTRS(l) == KEY_PTRS(r) &&
 456		KEY_CSUM(l) == KEY_CSUM(r));
457}
458
199/* Keylists */ 459/* Keylists */
200 460
201struct keylist { 461struct keylist {
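
Editor's aside: bch_bkey_equal_header() above exists so merge decisions stay generic: two keys whose headers disagree (dirty bit, number of pointers, checksum flag) must never be merged, regardless of what the per-format key_merge hook would report. A hedged sketch of such a guard around the ops hook; try_merge_guarded() is an illustrative name, the in-tree equivalent is bch_bkey_try_merge() declared in this header:

/* Illustrative only: a header-compatibility guard around ops->key_merge. */
static bool try_merge_guarded(struct btree_keys *b, struct bkey *l,
                              struct bkey *r)
{
        if (!b->ops->key_merge)
                return false;

        /* Keys with different headers are never merge candidates. */
        if (!bch_bkey_equal_header(l, r))
                return false;

        return b->ops->key_merge(b, l, r);
}
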
@@ -257,136 +517,44 @@ static inline size_t bch_keylist_bytes(struct keylist *l)
257 517
258struct bkey *bch_keylist_pop(struct keylist *); 518struct bkey *bch_keylist_pop(struct keylist *);
259void bch_keylist_pop_front(struct keylist *); 519void bch_keylist_pop_front(struct keylist *);
260int bch_keylist_realloc(struct keylist *, int, struct cache_set *); 520int __bch_keylist_realloc(struct keylist *, unsigned);
261
262void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
263 unsigned);
264bool __bch_cut_front(const struct bkey *, struct bkey *);
265bool __bch_cut_back(const struct bkey *, struct bkey *);
266 521
267static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) 522/* Debug stuff */
268{
269 BUG_ON(bkey_cmp(where, k) > 0);
270 return __bch_cut_front(where, k);
271}
272 523
273static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) 524#ifdef CONFIG_BCACHE_DEBUG
274{
275 BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
276 return __bch_cut_back(where, k);
277}
278
279const char *bch_ptr_status(struct cache_set *, const struct bkey *);
280bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
281bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
282
283bool bch_ptr_bad(struct btree *, const struct bkey *);
284
285static inline uint8_t gen_after(uint8_t a, uint8_t b)
286{
287 uint8_t r = a - b;
288 return r > 128U ? 0 : r;
289}
290
291static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
292 unsigned i)
293{
294 return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
295}
296
297static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
298 unsigned i)
299{
300 return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
301}
302
303
304typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
305
306struct bkey *bch_btree_iter_next(struct btree_iter *);
307struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
308 struct btree *, ptr_filter_fn);
309
310void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
311struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
312 struct bkey *, struct bset_tree *);
313
314/* 32 bits total: */
315#define BKEY_MID_BITS 3
316#define BKEY_EXPONENT_BITS 7
317#define BKEY_MANTISSA_BITS 22
318#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
319
320struct bkey_float {
321 unsigned exponent:BKEY_EXPONENT_BITS;
322 unsigned m:BKEY_MID_BITS;
323 unsigned mantissa:BKEY_MANTISSA_BITS;
324} __packed;
325
326/*
327 * BSET_CACHELINE was originally intended to match the hardware cacheline size -
328 * it used to be 64, but I realized the lookup code would touch slightly less
329 * memory if it was 128.
330 *
331 * It definites the number of bytes (in struct bset) per struct bkey_float in
332 * the auxiliar search tree - when we're done searching the bset_float tree we
333 * have this many bytes left that we do a linear search over.
334 *
335 * Since (after level 5) every level of the bset_tree is on a new cacheline,
336 * we're touching one fewer cacheline in the bset tree in exchange for one more
337 * cacheline in the linear search - but the linear search might stop before it
338 * gets to the second cacheline.
339 */
340
341#define BSET_CACHELINE 128
342#define bset_tree_space(b) (btree_data_space(b) / BSET_CACHELINE)
343 525
344#define bset_tree_bytes(b) (bset_tree_space(b) * sizeof(struct bkey_float)) 526int __bch_count_data(struct btree_keys *);
345#define bset_prev_bytes(b) (bset_tree_space(b) * sizeof(uint8_t)) 527void __bch_check_keys(struct btree_keys *, const char *, ...);
528void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
529void bch_dump_bucket(struct btree_keys *);
346 530
347void bch_bset_init_next(struct btree *); 531#else
348 532
349void bch_bset_fix_invalidated_key(struct btree *, struct bkey *); 533static inline int __bch_count_data(struct btree_keys *b) { return -1; }
350void bch_bset_fix_lookup_table(struct btree *, struct bkey *); 534static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
535static inline void bch_dump_bucket(struct btree_keys *b) {}
536void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
351 537
352struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, 538#endif
353 const struct bkey *);
354 539
355/* 540static inline bool btree_keys_expensive_checks(struct btree_keys *b)
356 * Returns the first key that is strictly greater than search
357 */
358static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
359 const struct bkey *search)
360{ 541{
361 return search ? __bch_bset_search(b, t, search) : t->data->start; 542#ifdef CONFIG_BCACHE_DEBUG
543 return *b->expensive_debug_checks;
544#else
545 return false;
546#endif
362} 547}
363 548
364#define PRECEDING_KEY(_k) \ 549static inline int bch_count_data(struct btree_keys *b)
365({ \
366 struct bkey *_ret = NULL; \
367 \
368 if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
369 _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
370 \
371 if (!_ret->low) \
372 _ret->high--; \
373 _ret->low--; \
374 } \
375 \
376 _ret; \
377})
378
379bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
380void bch_btree_sort_lazy(struct btree *);
381void bch_btree_sort_into(struct btree *, struct btree *);
382void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
383void bch_btree_sort_partial(struct btree *, unsigned);
384
385static inline void bch_btree_sort(struct btree *b)
386{ 550{
387 bch_btree_sort_partial(b, 0); 551 return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
388} 552}
389 553
390int bch_bset_print_stats(struct cache_set *, char *); 554#define bch_check_keys(b, ...) \
555do { \
556 if (btree_keys_expensive_checks(b)) \
557 __bch_check_keys(b, __VA_ARGS__); \
558} while (0)
391 559
392#endif 560#endif
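
Editor's aside: taken together, the new header turns struct btree_keys into a small vtable-driven object: callers allocate it, pick an ops table, and from then on only use the generic iteration and sort helpers. A hedged sketch of that wiring, mirroring the mca_alloc() hunk further down; init_node_keys() and its leaf flag are illustrative, while bch_extent_keys_ops and bch_btree_keys_ops are the two ops tables provided elsewhere in this series:

/* Illustrative wiring only; error handling trimmed. */
static int init_node_keys(struct btree_keys *keys, unsigned page_order,
                          bool leaf, bool *debug_checks)
{
        int ret = bch_btree_keys_alloc(keys, page_order, GFP_KERNEL);
        if (ret)
                return ret;

        bch_btree_keys_init(keys,
                            leaf ? &bch_extent_keys_ops : &bch_btree_keys_ops,
                            debug_checks);
        return 0;
}

After that, walking every live key no longer depends on the key format:

        struct btree_iter iter;
        struct bkey *k;

        for_each_key_filter(keys, k, &iter, bch_ptr_bad)
                /* use k */;
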
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 946ecd3b048b..98cc0a810a36 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -23,7 +23,7 @@
23#include "bcache.h" 23#include "bcache.h"
24#include "btree.h" 24#include "btree.h"
25#include "debug.h" 25#include "debug.h"
26#include "writeback.h" 26#include "extents.h"
27 27
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/bitops.h> 29#include <linux/bitops.h>
@@ -89,13 +89,6 @@
89 * Test module load/unload 89 * Test module load/unload
90 */ 90 */
91 91
92enum {
93 BTREE_INSERT_STATUS_INSERT,
94 BTREE_INSERT_STATUS_BACK_MERGE,
95 BTREE_INSERT_STATUS_OVERWROTE,
96 BTREE_INSERT_STATUS_FRONT_MERGE,
97};
98
99#define MAX_NEED_GC 64 92#define MAX_NEED_GC 64
100#define MAX_SAVE_PRIO 72 93#define MAX_SAVE_PRIO 72
101 94
@@ -106,14 +99,6 @@ enum {
106 99
107static struct workqueue_struct *btree_io_wq; 100static struct workqueue_struct *btree_io_wq;
108 101
109static inline bool should_split(struct btree *b)
110{
111 struct bset *i = write_block(b);
112 return b->written >= btree_blocks(b) ||
113 (b->written + __set_blocks(i, i->keys + 15, b->c)
114 > btree_blocks(b));
115}
116
117#define insert_lock(s, b) ((b)->level <= (s)->lock) 102#define insert_lock(s, b) ((b)->level <= (s)->lock)
118 103
119/* 104/*
@@ -167,6 +152,8 @@ static inline bool should_split(struct btree *b)
167 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 152 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
168 } \ 153 } \
169 rw_unlock(_w, _b); \ 154 rw_unlock(_w, _b); \
155 if (_r == -EINTR) \
156 schedule(); \
170 bch_cannibalize_unlock(c); \ 157 bch_cannibalize_unlock(c); \
171 if (_r == -ENOSPC) { \ 158 if (_r == -ENOSPC) { \
172 wait_event((c)->try_wait, \ 159 wait_event((c)->try_wait, \
@@ -175,9 +162,15 @@ static inline bool should_split(struct btree *b)
175 } \ 162 } \
176 } while (_r == -EINTR); \ 163 } while (_r == -EINTR); \
177 \ 164 \
165 finish_wait(&(c)->bucket_wait, &(op)->wait); \
178 _r; \ 166 _r; \
179}) 167})
180 168
169static inline struct bset *write_block(struct btree *b)
170{
171 return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
172}
173
181/* Btree key manipulation */ 174/* Btree key manipulation */
182 175
183void bkey_put(struct cache_set *c, struct bkey *k) 176void bkey_put(struct cache_set *c, struct bkey *k)
@@ -194,16 +187,16 @@ void bkey_put(struct cache_set *c, struct bkey *k)
194static uint64_t btree_csum_set(struct btree *b, struct bset *i) 187static uint64_t btree_csum_set(struct btree *b, struct bset *i)
195{ 188{
196 uint64_t crc = b->key.ptr[0]; 189 uint64_t crc = b->key.ptr[0];
197 void *data = (void *) i + 8, *end = end(i); 190 void *data = (void *) i + 8, *end = bset_bkey_last(i);
198 191
199 crc = bch_crc64_update(crc, data, end - data); 192 crc = bch_crc64_update(crc, data, end - data);
200 return crc ^ 0xffffffffffffffffULL; 193 return crc ^ 0xffffffffffffffffULL;
201} 194}
202 195
203static void bch_btree_node_read_done(struct btree *b) 196void bch_btree_node_read_done(struct btree *b)
204{ 197{
205 const char *err = "bad btree header"; 198 const char *err = "bad btree header";
206 struct bset *i = b->sets[0].data; 199 struct bset *i = btree_bset_first(b);
207 struct btree_iter *iter; 200 struct btree_iter *iter;
208 201
209 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); 202 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
@@ -211,21 +204,22 @@ static void bch_btree_node_read_done(struct btree *b)
211 iter->used = 0; 204 iter->used = 0;
212 205
213#ifdef CONFIG_BCACHE_DEBUG 206#ifdef CONFIG_BCACHE_DEBUG
214 iter->b = b; 207 iter->b = &b->keys;
215#endif 208#endif
216 209
217 if (!i->seq) 210 if (!i->seq)
218 goto err; 211 goto err;
219 212
220 for (; 213 for (;
221 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; 214 b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
222 i = write_block(b)) { 215 i = write_block(b)) {
223 err = "unsupported bset version"; 216 err = "unsupported bset version";
224 if (i->version > BCACHE_BSET_VERSION) 217 if (i->version > BCACHE_BSET_VERSION)
225 goto err; 218 goto err;
226 219
227 err = "bad btree header"; 220 err = "bad btree header";
228 if (b->written + set_blocks(i, b->c) > btree_blocks(b)) 221 if (b->written + set_blocks(i, block_bytes(b->c)) >
222 btree_blocks(b))
229 goto err; 223 goto err;
230 224
231 err = "bad magic"; 225 err = "bad magic";
@@ -245,39 +239,40 @@ static void bch_btree_node_read_done(struct btree *b)
245 } 239 }
246 240
247 err = "empty set"; 241 err = "empty set";
248 if (i != b->sets[0].data && !i->keys) 242 if (i != b->keys.set[0].data && !i->keys)
249 goto err; 243 goto err;
250 244
251 bch_btree_iter_push(iter, i->start, end(i)); 245 bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
252 246
253 b->written += set_blocks(i, b->c); 247 b->written += set_blocks(i, block_bytes(b->c));
254 } 248 }
255 249
256 err = "corrupted btree"; 250 err = "corrupted btree";
257 for (i = write_block(b); 251 for (i = write_block(b);
258 index(i, b) < btree_blocks(b); 252 bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
259 i = ((void *) i) + block_bytes(b->c)) 253 i = ((void *) i) + block_bytes(b->c))
260 if (i->seq == b->sets[0].data->seq) 254 if (i->seq == b->keys.set[0].data->seq)
261 goto err; 255 goto err;
262 256
263 bch_btree_sort_and_fix_extents(b, iter); 257 bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
264 258
265 i = b->sets[0].data; 259 i = b->keys.set[0].data;
266 err = "short btree key"; 260 err = "short btree key";
267 if (b->sets[0].size && 261 if (b->keys.set[0].size &&
268 bkey_cmp(&b->key, &b->sets[0].end) < 0) 262 bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
269 goto err; 263 goto err;
270 264
271 if (b->written < btree_blocks(b)) 265 if (b->written < btree_blocks(b))
272 bch_bset_init_next(b); 266 bch_bset_init_next(&b->keys, write_block(b),
267 bset_magic(&b->c->sb));
273out: 268out:
274 mempool_free(iter, b->c->fill_iter); 269 mempool_free(iter, b->c->fill_iter);
275 return; 270 return;
276err: 271err:
277 set_btree_node_io_error(b); 272 set_btree_node_io_error(b);
278 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 273 bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
279 err, PTR_BUCKET_NR(b->c, &b->key, 0), 274 err, PTR_BUCKET_NR(b->c, &b->key, 0),
280 index(i, b), i->keys); 275 bset_block_offset(b, i), i->keys);
281 goto out; 276 goto out;
282} 277}
283 278
@@ -287,7 +282,7 @@ static void btree_node_read_endio(struct bio *bio, int error)
287 closure_put(cl); 282 closure_put(cl);
288} 283}
289 284
290void bch_btree_node_read(struct btree *b) 285static void bch_btree_node_read(struct btree *b)
291{ 286{
292 uint64_t start_time = local_clock(); 287 uint64_t start_time = local_clock();
293 struct closure cl; 288 struct closure cl;
@@ -303,7 +298,7 @@ void bch_btree_node_read(struct btree *b)
303 bio->bi_end_io = btree_node_read_endio; 298 bio->bi_end_io = btree_node_read_endio;
304 bio->bi_private = &cl; 299 bio->bi_private = &cl;
305 300
306 bch_bio_map(bio, b->sets[0].data); 301 bch_bio_map(bio, b->keys.set[0].data);
307 302
308 bch_submit_bbio(bio, b->c, &b->key, 0); 303 bch_submit_bbio(bio, b->c, &b->key, 0);
309 closure_sync(&cl); 304 closure_sync(&cl);
@@ -340,9 +335,16 @@ static void btree_complete_write(struct btree *b, struct btree_write *w)
340 w->journal = NULL; 335 w->journal = NULL;
341} 336}
342 337
338static void btree_node_write_unlock(struct closure *cl)
339{
340 struct btree *b = container_of(cl, struct btree, io);
341
342 up(&b->io_mutex);
343}
344
343static void __btree_node_write_done(struct closure *cl) 345static void __btree_node_write_done(struct closure *cl)
344{ 346{
345 struct btree *b = container_of(cl, struct btree, io.cl); 347 struct btree *b = container_of(cl, struct btree, io);
346 struct btree_write *w = btree_prev_write(b); 348 struct btree_write *w = btree_prev_write(b);
347 349
348 bch_bbio_free(b->bio, b->c); 350 bch_bbio_free(b->bio, b->c);
@@ -353,12 +355,12 @@ static void __btree_node_write_done(struct closure *cl)
353 queue_delayed_work(btree_io_wq, &b->work, 355 queue_delayed_work(btree_io_wq, &b->work,
354 msecs_to_jiffies(30000)); 356 msecs_to_jiffies(30000));
355 357
356 closure_return(cl); 358 closure_return_with_destructor(cl, btree_node_write_unlock);
357} 359}
358 360
359static void btree_node_write_done(struct closure *cl) 361static void btree_node_write_done(struct closure *cl)
360{ 362{
361 struct btree *b = container_of(cl, struct btree, io.cl); 363 struct btree *b = container_of(cl, struct btree, io);
362 struct bio_vec *bv; 364 struct bio_vec *bv;
363 int n; 365 int n;
364 366
@@ -371,7 +373,7 @@ static void btree_node_write_done(struct closure *cl)
371static void btree_node_write_endio(struct bio *bio, int error) 373static void btree_node_write_endio(struct bio *bio, int error)
372{ 374{
373 struct closure *cl = bio->bi_private; 375 struct closure *cl = bio->bi_private;
374 struct btree *b = container_of(cl, struct btree, io.cl); 376 struct btree *b = container_of(cl, struct btree, io);
375 377
376 if (error) 378 if (error)
377 set_btree_node_io_error(b); 379 set_btree_node_io_error(b);
@@ -382,8 +384,8 @@ static void btree_node_write_endio(struct bio *bio, int error)
382 384
383static void do_btree_node_write(struct btree *b) 385static void do_btree_node_write(struct btree *b)
384{ 386{
385 struct closure *cl = &b->io.cl; 387 struct closure *cl = &b->io;
386 struct bset *i = b->sets[b->nsets].data; 388 struct bset *i = btree_bset_last(b);
387 BKEY_PADDED(key) k; 389 BKEY_PADDED(key) k;
388 390
389 i->version = BCACHE_BSET_VERSION; 391 i->version = BCACHE_BSET_VERSION;
@@ -395,7 +397,7 @@ static void do_btree_node_write(struct btree *b)
395 b->bio->bi_end_io = btree_node_write_endio; 397 b->bio->bi_end_io = btree_node_write_endio;
396 b->bio->bi_private = cl; 398 b->bio->bi_private = cl;
397 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 399 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
398 b->bio->bi_iter.bi_size = set_blocks(i, b->c) * block_bytes(b->c); 400 b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
399 bch_bio_map(b->bio, i); 401 bch_bio_map(b->bio, i);
400 402
401 /* 403 /*
@@ -414,7 +416,8 @@ static void do_btree_node_write(struct btree *b)
414 */ 416 */
415 417
416 bkey_copy(&k.key, &b->key); 418 bkey_copy(&k.key, &b->key);
417 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 419 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
420 bset_sector_offset(&b->keys, i));
418 421
419 if (!bio_alloc_pages(b->bio, GFP_NOIO)) { 422 if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
420 int j; 423 int j;
@@ -435,40 +438,54 @@ static void do_btree_node_write(struct btree *b)
435 bch_submit_bbio(b->bio, b->c, &k.key, 0); 438 bch_submit_bbio(b->bio, b->c, &k.key, 0);
436 439
437 closure_sync(cl); 440 closure_sync(cl);
438 __btree_node_write_done(cl); 441 continue_at_nobarrier(cl, __btree_node_write_done, NULL);
439 } 442 }
440} 443}
441 444
442void bch_btree_node_write(struct btree *b, struct closure *parent) 445void bch_btree_node_write(struct btree *b, struct closure *parent)
443{ 446{
444 struct bset *i = b->sets[b->nsets].data; 447 struct bset *i = btree_bset_last(b);
445 448
446 trace_bcache_btree_write(b); 449 trace_bcache_btree_write(b);
447 450
448 BUG_ON(current->bio_list); 451 BUG_ON(current->bio_list);
449 BUG_ON(b->written >= btree_blocks(b)); 452 BUG_ON(b->written >= btree_blocks(b));
450 BUG_ON(b->written && !i->keys); 453 BUG_ON(b->written && !i->keys);
451 BUG_ON(b->sets->data->seq != i->seq); 454 BUG_ON(btree_bset_first(b)->seq != i->seq);
452 bch_check_keys(b, "writing"); 455 bch_check_keys(&b->keys, "writing");
453 456
454 cancel_delayed_work(&b->work); 457 cancel_delayed_work(&b->work);
455 458
456 /* If caller isn't waiting for write, parent refcount is cache set */ 459 /* If caller isn't waiting for write, parent refcount is cache set */
457 closure_lock(&b->io, parent ?: &b->c->cl); 460 down(&b->io_mutex);
461 closure_init(&b->io, parent ?: &b->c->cl);
458 462
459 clear_bit(BTREE_NODE_dirty, &b->flags); 463 clear_bit(BTREE_NODE_dirty, &b->flags);
460 change_bit(BTREE_NODE_write_idx, &b->flags); 464 change_bit(BTREE_NODE_write_idx, &b->flags);
461 465
462 do_btree_node_write(b); 466 do_btree_node_write(b);
463 467
464 b->written += set_blocks(i, b->c); 468 atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
465 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
466 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 469 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
467 470
468 bch_btree_sort_lazy(b); 471 b->written += set_blocks(i, block_bytes(b->c));
472
473 /* If not a leaf node, always sort */
474 if (b->level && b->keys.nsets)
475 bch_btree_sort(&b->keys, &b->c->sort);
476 else
477 bch_btree_sort_lazy(&b->keys, &b->c->sort);
478
479 /*
480 * do verify if there was more than one set initially (i.e. we did a
481 * sort) and we sorted down to a single set:
482 */
483 if (i != b->keys.set->data && !b->keys.nsets)
484 bch_btree_verify(b);
469 485
470 if (b->written < btree_blocks(b)) 486 if (b->written < btree_blocks(b))
471 bch_bset_init_next(b); 487 bch_bset_init_next(&b->keys, write_block(b),
488 bset_magic(&b->c->sb));
472} 489}
473 490
474static void bch_btree_node_write_sync(struct btree *b) 491static void bch_btree_node_write_sync(struct btree *b)
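
Editor's aside: the write path above also swaps the old embedded closure lock for a plain semaphore: sema_init(&b->io_mutex, 1) at allocation, down(&b->io_mutex) before starting a write in bch_btree_node_write(), and up() from the btree_node_write_unlock() destructor once the write completes. That turns "is a write in flight?" into a simple trylock probe, exactly as mca_reap() does further down. A hedged restatement of that probe; btree_write_in_flight() is an illustrative name, not part of the patch:

/* Illustrative only: mirrors the down_trylock()/up() probe in mca_reap(). */
static bool btree_write_in_flight(struct btree *b)
{
        if (down_trylock(&b->io_mutex))
                return true;    /* a writer holds io_mutex: I/O in flight */

        up(&b->io_mutex);       /* nothing in flight; drop the probe */
        return false;
}
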
@@ -493,7 +510,7 @@ static void btree_node_write_work(struct work_struct *w)
493 510
494static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) 511static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
495{ 512{
496 struct bset *i = b->sets[b->nsets].data; 513 struct bset *i = btree_bset_last(b);
497 struct btree_write *w = btree_current_write(b); 514 struct btree_write *w = btree_current_write(b);
498 515
499 BUG_ON(!b->written); 516 BUG_ON(!b->written);
@@ -528,24 +545,6 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
528 * mca -> memory cache 545 * mca -> memory cache
529 */ 546 */
530 547
531static void mca_reinit(struct btree *b)
532{
533 unsigned i;
534
535 b->flags = 0;
536 b->written = 0;
537 b->nsets = 0;
538
539 for (i = 0; i < MAX_BSETS; i++)
540 b->sets[i].size = 0;
541 /*
542 * Second loop starts at 1 because b->sets[0]->data is the memory we
543 * allocated
544 */
545 for (i = 1; i < MAX_BSETS; i++)
546 b->sets[i].data = NULL;
547}
548
549#define mca_reserve(c) (((c->root && c->root->level) \ 548#define mca_reserve(c) (((c->root && c->root->level) \
550 ? c->root->level : 1) * 8 + 16) 549 ? c->root->level : 1) * 8 + 16)
551#define mca_can_free(c) \ 550#define mca_can_free(c) \
@@ -553,28 +552,12 @@ static void mca_reinit(struct btree *b)
553 552
554static void mca_data_free(struct btree *b) 553static void mca_data_free(struct btree *b)
555{ 554{
556 struct bset_tree *t = b->sets; 555 BUG_ON(b->io_mutex.count != 1);
557 BUG_ON(!closure_is_unlocked(&b->io.cl));
558 556
559 if (bset_prev_bytes(b) < PAGE_SIZE) 557 bch_btree_keys_free(&b->keys);
560 kfree(t->prev);
561 else
562 free_pages((unsigned long) t->prev,
563 get_order(bset_prev_bytes(b)));
564 558
565 if (bset_tree_bytes(b) < PAGE_SIZE)
566 kfree(t->tree);
567 else
568 free_pages((unsigned long) t->tree,
569 get_order(bset_tree_bytes(b)));
570
571 free_pages((unsigned long) t->data, b->page_order);
572
573 t->prev = NULL;
574 t->tree = NULL;
575 t->data = NULL;
576 list_move(&b->list, &b->c->btree_cache_freed);
577 b->c->bucket_cache_used--; 559 b->c->bucket_cache_used--;
560 list_move(&b->list, &b->c->btree_cache_freed);
578} 561}
579 562
580static void mca_bucket_free(struct btree *b) 563static void mca_bucket_free(struct btree *b)
@@ -593,34 +576,16 @@ static unsigned btree_order(struct bkey *k)
593 576
594static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 577static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
595{ 578{
596 struct bset_tree *t = b->sets; 579 if (!bch_btree_keys_alloc(&b->keys,
597 BUG_ON(t->data); 580 max_t(unsigned,
598 581 ilog2(b->c->btree_pages),
599 b->page_order = max_t(unsigned, 582 btree_order(k)),
600 ilog2(b->c->btree_pages), 583 gfp)) {
601 btree_order(k)); 584 b->c->bucket_cache_used++;
602 585 list_move(&b->list, &b->c->btree_cache);
603 t->data = (void *) __get_free_pages(gfp, b->page_order); 586 } else {
604 if (!t->data) 587 list_move(&b->list, &b->c->btree_cache_freed);
605 goto err; 588 }
606
607 t->tree = bset_tree_bytes(b) < PAGE_SIZE
608 ? kmalloc(bset_tree_bytes(b), gfp)
609 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
610 if (!t->tree)
611 goto err;
612
613 t->prev = bset_prev_bytes(b) < PAGE_SIZE
614 ? kmalloc(bset_prev_bytes(b), gfp)
615 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
616 if (!t->prev)
617 goto err;
618
619 list_move(&b->list, &b->c->btree_cache);
620 b->c->bucket_cache_used++;
621 return;
622err:
623 mca_data_free(b);
624} 589}
625 590
626static struct btree *mca_bucket_alloc(struct cache_set *c, 591static struct btree *mca_bucket_alloc(struct cache_set *c,
@@ -635,7 +600,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
635 INIT_LIST_HEAD(&b->list); 600 INIT_LIST_HEAD(&b->list);
636 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 601 INIT_DELAYED_WORK(&b->work, btree_node_write_work);
637 b->c = c; 602 b->c = c;
638 closure_init_unlocked(&b->io); 603 sema_init(&b->io_mutex, 1);
639 604
640 mca_data_alloc(b, k, gfp); 605 mca_data_alloc(b, k, gfp);
641 return b; 606 return b;
@@ -651,24 +616,31 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
651 if (!down_write_trylock(&b->lock)) 616 if (!down_write_trylock(&b->lock))
652 return -ENOMEM; 617 return -ENOMEM;
653 618
654 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 619 BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
655 620
656 if (b->page_order < min_order || 621 if (b->keys.page_order < min_order)
657 (!flush && 622 goto out_unlock;
658 (btree_node_dirty(b) || 623
659 atomic_read(&b->io.cl.remaining) != -1))) { 624 if (!flush) {
660 rw_unlock(true, b); 625 if (btree_node_dirty(b))
661 return -ENOMEM; 626 goto out_unlock;
627
628 if (down_trylock(&b->io_mutex))
629 goto out_unlock;
630 up(&b->io_mutex);
662 } 631 }
663 632
664 if (btree_node_dirty(b)) 633 if (btree_node_dirty(b))
665 bch_btree_node_write_sync(b); 634 bch_btree_node_write_sync(b);
666 635
667 /* wait for any in flight btree write */ 636 /* wait for any in flight btree write */
668 closure_wait_event(&b->io.wait, &cl, 637 down(&b->io_mutex);
669 atomic_read(&b->io.cl.remaining) == -1); 638 up(&b->io_mutex);
670 639
671 return 0; 640 return 0;
641out_unlock:
642 rw_unlock(true, b);
643 return -ENOMEM;
672} 644}
673 645
674static unsigned long bch_mca_scan(struct shrinker *shrink, 646static unsigned long bch_mca_scan(struct shrinker *shrink,
@@ -714,14 +686,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
714 } 686 }
715 } 687 }
716 688
717 /*
718 * Can happen right when we first start up, before we've read in any
719 * btree nodes
720 */
721 if (list_empty(&c->btree_cache))
722 goto out;
723
724 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { 689 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
690 if (list_empty(&c->btree_cache))
691 goto out;
692
725 b = list_first_entry(&c->btree_cache, struct btree, list); 693 b = list_first_entry(&c->btree_cache, struct btree, list);
726 list_rotate_left(&c->btree_cache); 694 list_rotate_left(&c->btree_cache);
727 695
@@ -767,6 +735,8 @@ void bch_btree_cache_free(struct cache_set *c)
767#ifdef CONFIG_BCACHE_DEBUG 735#ifdef CONFIG_BCACHE_DEBUG
768 if (c->verify_data) 736 if (c->verify_data)
769 list_move(&c->verify_data->list, &c->btree_cache); 737 list_move(&c->verify_data->list, &c->btree_cache);
738
739 free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
770#endif 740#endif
771 741
772 list_splice(&c->btree_cache_freeable, 742 list_splice(&c->btree_cache_freeable,
@@ -807,10 +777,13 @@ int bch_btree_cache_alloc(struct cache_set *c)
807#ifdef CONFIG_BCACHE_DEBUG 777#ifdef CONFIG_BCACHE_DEBUG
808 mutex_init(&c->verify_lock); 778 mutex_init(&c->verify_lock);
809 779
780 c->verify_ondisk = (void *)
781 __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
782
810 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 783 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
811 784
812 if (c->verify_data && 785 if (c->verify_data &&
813 c->verify_data->sets[0].data) 786 c->verify_data->keys.set->data)
814 list_del_init(&c->verify_data->list); 787 list_del_init(&c->verify_data->list);
815 else 788 else
816 c->verify_data = NULL; 789 c->verify_data = NULL;
@@ -908,7 +881,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
908 list_for_each_entry(b, &c->btree_cache_freed, list) 881 list_for_each_entry(b, &c->btree_cache_freed, list)
909 if (!mca_reap(b, 0, false)) { 882 if (!mca_reap(b, 0, false)) {
910 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 883 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
911 if (!b->sets[0].data) 884 if (!b->keys.set[0].data)
912 goto err; 885 goto err;
913 else 886 else
914 goto out; 887 goto out;
@@ -919,10 +892,10 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
919 goto err; 892 goto err;
920 893
921 BUG_ON(!down_write_trylock(&b->lock)); 894 BUG_ON(!down_write_trylock(&b->lock));
922 if (!b->sets->data) 895 if (!b->keys.set->data)
923 goto err; 896 goto err;
924out: 897out:
925 BUG_ON(!closure_is_unlocked(&b->io.cl)); 898 BUG_ON(b->io_mutex.count != 1);
926 899
927 bkey_copy(&b->key, k); 900 bkey_copy(&b->key, k);
928 list_move(&b->list, &c->btree_cache); 901 list_move(&b->list, &c->btree_cache);
@@ -930,10 +903,17 @@ out:
930 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 903 hlist_add_head_rcu(&b->hash, mca_hash(c, k));
931 904
932 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 905 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
933 b->level = level;
934 b->parent = (void *) ~0UL; 906 b->parent = (void *) ~0UL;
907 b->flags = 0;
908 b->written = 0;
909 b->level = level;
935 910
936 mca_reinit(b); 911 if (!b->level)
912 bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
913 &b->c->expensive_debug_checks);
914 else
915 bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
916 &b->c->expensive_debug_checks);
937 917
938 return b; 918 return b;
939err: 919err:
@@ -994,13 +974,13 @@ retry:
994 974
995 b->accessed = 1; 975 b->accessed = 1;
996 976
997 for (; i <= b->nsets && b->sets[i].size; i++) { 977 for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
998 prefetch(b->sets[i].tree); 978 prefetch(b->keys.set[i].tree);
999 prefetch(b->sets[i].data); 979 prefetch(b->keys.set[i].data);
1000 } 980 }
1001 981
1002 for (; i <= b->nsets; i++) 982 for (; i <= b->keys.nsets; i++)
1003 prefetch(b->sets[i].data); 983 prefetch(b->keys.set[i].data);
1004 984
1005 if (btree_node_io_error(b)) { 985 if (btree_node_io_error(b)) {
1006 rw_unlock(write, b); 986 rw_unlock(write, b);
@@ -1063,7 +1043,7 @@ struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
1063 1043
1064 mutex_lock(&c->bucket_lock); 1044 mutex_lock(&c->bucket_lock);
1065retry: 1045retry:
1066 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) 1046 if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
1067 goto err; 1047 goto err;
1068 1048
1069 bkey_put(c, &k.key); 1049 bkey_put(c, &k.key);
@@ -1080,7 +1060,7 @@ retry:
1080 } 1060 }
1081 1061
1082 b->accessed = 1; 1062 b->accessed = 1;
1083 bch_bset_init_next(b); 1063 bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
1084 1064
1085 mutex_unlock(&c->bucket_lock); 1065 mutex_unlock(&c->bucket_lock);
1086 1066
@@ -1098,8 +1078,10 @@ err:
1098static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) 1078static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
1099{ 1079{
1100 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); 1080 struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
1101 if (!IS_ERR_OR_NULL(n)) 1081 if (!IS_ERR_OR_NULL(n)) {
1102 bch_btree_sort_into(b, n); 1082 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1083 bkey_copy_key(&n->key, &b->key);
1084 }
1103 1085
1104 return n; 1086 return n;
1105} 1087}
@@ -1120,6 +1102,28 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1120 atomic_inc(&b->c->prio_blocked); 1102 atomic_inc(&b->c->prio_blocked);
1121} 1103}
1122 1104
1105static int btree_check_reserve(struct btree *b, struct btree_op *op)
1106{
1107 struct cache_set *c = b->c;
1108 struct cache *ca;
1109 unsigned i, reserve = c->root->level * 2 + 1;
1110 int ret = 0;
1111
1112 mutex_lock(&c->bucket_lock);
1113
1114 for_each_cache(ca, c, i)
1115 if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1116 if (op)
1117 prepare_to_wait(&c->bucket_wait, &op->wait,
1118 TASK_UNINTERRUPTIBLE);
1119 ret = -EINTR;
1120 break;
1121 }
1122
1123 mutex_unlock(&c->bucket_lock);
1124 return ret;
1125}
1126
1123/* Garbage collection */ 1127/* Garbage collection */
1124 1128
1125uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1129uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
@@ -1183,11 +1187,11 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1183 1187
1184 gc->nodes++; 1188 gc->nodes++;
1185 1189
1186 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1190 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1187 stale = max(stale, btree_mark_key(b, k)); 1191 stale = max(stale, btree_mark_key(b, k));
1188 keys++; 1192 keys++;
1189 1193
1190 if (bch_ptr_bad(b, k)) 1194 if (bch_ptr_bad(&b->keys, k))
1191 continue; 1195 continue;
1192 1196
1193 gc->key_bytes += bkey_u64s(k); 1197 gc->key_bytes += bkey_u64s(k);
@@ -1197,9 +1201,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1197 gc->data += KEY_SIZE(k); 1201 gc->data += KEY_SIZE(k);
1198 } 1202 }
1199 1203
1200 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1204 for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1201 btree_bug_on(t->size && 1205 btree_bug_on(t->size &&
1202 bset_written(b, t) && 1206 bset_written(&b->keys, t) &&
1203 bkey_cmp(&b->key, &t->end) < 0, 1207 bkey_cmp(&b->key, &t->end) < 0,
1204 b, "found short btree key in gc"); 1208 b, "found short btree key in gc");
1205 1209
@@ -1243,7 +1247,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1243 blocks = btree_default_blocks(b->c) * 2 / 3; 1247 blocks = btree_default_blocks(b->c) * 2 / 3;
1244 1248
1245 if (nodes < 2 || 1249 if (nodes < 2 ||
1246 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1250 __set_blocks(b->keys.set[0].data, keys,
1251 block_bytes(b->c)) > blocks * (nodes - 1))
1247 return 0; 1252 return 0;
1248 1253
1249 for (i = 0; i < nodes; i++) { 1254 for (i = 0; i < nodes; i++) {
@@ -1253,18 +1258,19 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1253 } 1258 }
1254 1259
1255 for (i = nodes - 1; i > 0; --i) { 1260 for (i = nodes - 1; i > 0; --i) {
1256 struct bset *n1 = new_nodes[i]->sets->data; 1261 struct bset *n1 = btree_bset_first(new_nodes[i]);
1257 struct bset *n2 = new_nodes[i - 1]->sets->data; 1262 struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
1258 struct bkey *k, *last = NULL; 1263 struct bkey *k, *last = NULL;
1259 1264
1260 keys = 0; 1265 keys = 0;
1261 1266
1262 if (i > 1) { 1267 if (i > 1) {
1263 for (k = n2->start; 1268 for (k = n2->start;
1264 k < end(n2); 1269 k < bset_bkey_last(n2);
1265 k = bkey_next(k)) { 1270 k = bkey_next(k)) {
1266 if (__set_blocks(n1, n1->keys + keys + 1271 if (__set_blocks(n1, n1->keys + keys +
1267 bkey_u64s(k), b->c) > blocks) 1272 bkey_u64s(k),
1273 block_bytes(b->c)) > blocks)
1268 break; 1274 break;
1269 1275
1270 last = k; 1276 last = k;
@@ -1280,7 +1286,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1280 * though) 1286 * though)
1281 */ 1287 */
1282 if (__set_blocks(n1, n1->keys + n2->keys, 1288 if (__set_blocks(n1, n1->keys + n2->keys,
1283 b->c) > btree_blocks(new_nodes[i])) 1289 block_bytes(b->c)) >
1290 btree_blocks(new_nodes[i]))
1284 goto out_nocoalesce; 1291 goto out_nocoalesce;
1285 1292
1286 keys = n2->keys; 1293 keys = n2->keys;
@@ -1288,27 +1295,28 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1288 last = &r->b->key; 1295 last = &r->b->key;
1289 } 1296 }
1290 1297
1291 BUG_ON(__set_blocks(n1, n1->keys + keys, 1298 BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
1292 b->c) > btree_blocks(new_nodes[i])); 1299 btree_blocks(new_nodes[i]));
1293 1300
1294 if (last) 1301 if (last)
1295 bkey_copy_key(&new_nodes[i]->key, last); 1302 bkey_copy_key(&new_nodes[i]->key, last);
1296 1303
1297 memcpy(end(n1), 1304 memcpy(bset_bkey_last(n1),
1298 n2->start, 1305 n2->start,
1299 (void *) node(n2, keys) - (void *) n2->start); 1306 (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
1300 1307
1301 n1->keys += keys; 1308 n1->keys += keys;
1302 r[i].keys = n1->keys; 1309 r[i].keys = n1->keys;
1303 1310
1304 memmove(n2->start, 1311 memmove(n2->start,
1305 node(n2, keys), 1312 bset_bkey_idx(n2, keys),
1306 (void *) end(n2) - (void *) node(n2, keys)); 1313 (void *) bset_bkey_last(n2) -
1314 (void *) bset_bkey_idx(n2, keys));
1307 1315
1308 n2->keys -= keys; 1316 n2->keys -= keys;
1309 1317
1310 if (bch_keylist_realloc(keylist, 1318 if (__bch_keylist_realloc(keylist,
1311 KEY_PTRS(&new_nodes[i]->key), b->c)) 1319 bkey_u64s(&new_nodes[i]->key)))
1312 goto out_nocoalesce; 1320 goto out_nocoalesce;
1313 1321
1314 bch_btree_node_write(new_nodes[i], &cl); 1322 bch_btree_node_write(new_nodes[i], &cl);
@@ -1316,7 +1324,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1316 } 1324 }
1317 1325
1318 for (i = 0; i < nodes; i++) { 1326 for (i = 0; i < nodes; i++) {
1319 if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) 1327 if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
1320 goto out_nocoalesce; 1328 goto out_nocoalesce;
1321 1329
1322 make_btree_freeing_key(r[i].b, keylist->top); 1330 make_btree_freeing_key(r[i].b, keylist->top);
@@ -1324,7 +1332,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1324 } 1332 }
1325 1333
1326 /* We emptied out this node */ 1334 /* We emptied out this node */
1327 BUG_ON(new_nodes[0]->sets->data->keys); 1335 BUG_ON(btree_bset_first(new_nodes[0])->keys);
1328 btree_node_free(new_nodes[0]); 1336 btree_node_free(new_nodes[0]);
1329 rw_unlock(true, new_nodes[0]); 1337 rw_unlock(true, new_nodes[0]);
1330 1338
@@ -1370,7 +1378,7 @@ static unsigned btree_gc_count_keys(struct btree *b)
1370 struct btree_iter iter; 1378 struct btree_iter iter;
1371 unsigned ret = 0; 1379 unsigned ret = 0;
1372 1380
1373 for_each_key_filter(b, k, &iter, bch_ptr_bad) 1381 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1374 ret += bkey_u64s(k); 1382 ret += bkey_u64s(k);
1375 1383
1376 return ret; 1384 return ret;
@@ -1390,13 +1398,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1390 struct gc_merge_info *last = r + GC_MERGE_NODES - 1; 1398 struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
1391 1399
1392 bch_keylist_init(&keys); 1400 bch_keylist_init(&keys);
1393 bch_btree_iter_init(b, &iter, &b->c->gc_done); 1401 bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
1394 1402
1395 for (i = 0; i < GC_MERGE_NODES; i++) 1403 for (i = 0; i < GC_MERGE_NODES; i++)
1396 r[i].b = ERR_PTR(-EINTR); 1404 r[i].b = ERR_PTR(-EINTR);
1397 1405
1398 while (1) { 1406 while (1) {
1399 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1407 k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
1400 if (k) { 1408 if (k) {
1401 r->b = bch_btree_node_get(b->c, k, b->level - 1, true); 1409 r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1402 if (IS_ERR(r->b)) { 1410 if (IS_ERR(r->b)) {
@@ -1416,7 +1424,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1416 1424
1417 if (!IS_ERR(last->b)) { 1425 if (!IS_ERR(last->b)) {
1418 should_rewrite = btree_gc_mark_node(last->b, gc); 1426 should_rewrite = btree_gc_mark_node(last->b, gc);
1419 if (should_rewrite) { 1427 if (should_rewrite &&
1428 !btree_check_reserve(b, NULL)) {
1420 n = btree_node_alloc_replacement(last->b, 1429 n = btree_node_alloc_replacement(last->b,
1421 false); 1430 false);
1422 1431
@@ -1705,7 +1714,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1705 struct bucket *g; 1714 struct bucket *g;
1706 struct btree_iter iter; 1715 struct btree_iter iter;
1707 1716
1708 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1717 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1709 for (i = 0; i < KEY_PTRS(k); i++) { 1718 for (i = 0; i < KEY_PTRS(k); i++) {
1710 if (!ptr_available(b->c, k, i)) 1719 if (!ptr_available(b->c, k, i))
1711 continue; 1720 continue;
@@ -1728,10 +1737,11 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1728 } 1737 }
1729 1738
1730 if (b->level) { 1739 if (b->level) {
1731 bch_btree_iter_init(b, &iter, NULL); 1740 bch_btree_iter_init(&b->keys, &iter, NULL);
1732 1741
1733 do { 1742 do {
1734 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 1743 k = bch_btree_iter_next_filter(&iter, &b->keys,
1744 bch_ptr_bad);
1735 if (k) 1745 if (k)
1736 btree_node_prefetch(b->c, k, b->level - 1); 1746 btree_node_prefetch(b->c, k, b->level - 1);
1737 1747
@@ -1774,235 +1784,36 @@ err:
1774 1784
1775/* Btree insertion */ 1785/* Btree insertion */
1776 1786
1777static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) 1787static bool btree_insert_key(struct btree *b, struct bkey *k,
1778{ 1788 struct bkey *replace_key)
1779 struct bset *i = b->sets[b->nsets].data;
1780
1781 memmove((uint64_t *) where + bkey_u64s(insert),
1782 where,
1783 (void *) end(i) - (void *) where);
1784
1785 i->keys += bkey_u64s(insert);
1786 bkey_copy(where, insert);
1787 bch_bset_fix_lookup_table(b, where);
1788}
1789
1790static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
1791 struct btree_iter *iter,
1792 struct bkey *replace_key)
1793{ 1789{
1794 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1790 unsigned status;
1795 {
1796 if (KEY_DIRTY(k))
1797 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1798 offset, -sectors);
1799 }
1800
1801 uint64_t old_offset;
1802 unsigned old_size, sectors_found = 0;
1803
1804 while (1) {
1805 struct bkey *k = bch_btree_iter_next(iter);
1806 if (!k ||
1807 bkey_cmp(&START_KEY(k), insert) >= 0)
1808 break;
1809
1810 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1811 continue;
1812
1813 old_offset = KEY_START(k);
1814 old_size = KEY_SIZE(k);
1815
1816 /*
1817 * We might overlap with 0 size extents; we can't skip these
1818 * because if they're in the set we're inserting to we have to
1819 * adjust them so they don't overlap with the key we're
1820 * inserting. But we don't want to check them for replace
1821 * operations.
1822 */
1823
1824 if (replace_key && KEY_SIZE(k)) {
1825 /*
1826 * k might have been split since we inserted/found the
1827 * key we're replacing
1828 */
1829 unsigned i;
1830 uint64_t offset = KEY_START(k) -
1831 KEY_START(replace_key);
1832
1833 /* But it must be a subset of the replace key */
1834 if (KEY_START(k) < KEY_START(replace_key) ||
1835 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
1836 goto check_failed;
1837
1838 /* We didn't find a key that we were supposed to */
1839 if (KEY_START(k) > KEY_START(insert) + sectors_found)
1840 goto check_failed;
1841
1842 if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
1843 KEY_DIRTY(k) != KEY_DIRTY(replace_key))
1844 goto check_failed;
1845
1846 /* skip past gen */
1847 offset <<= 8;
1848
1849 BUG_ON(!KEY_PTRS(replace_key));
1850 1791
1851 for (i = 0; i < KEY_PTRS(replace_key); i++) 1792 BUG_ON(bkey_cmp(k, &b->key) > 0);
1852 if (k->ptr[i] != replace_key->ptr[i] + offset)
1853 goto check_failed;
1854
1855 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1856 }
1857
1858 if (bkey_cmp(insert, k) < 0 &&
1859 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1860 /*
1861 * We overlapped in the middle of an existing key: that
1862 * means we have to split the old key. But we have to do
1863 * slightly different things depending on whether the
1864 * old key has been written out yet.
1865 */
1866
1867 struct bkey *top;
1868
1869 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1870
1871 if (bkey_written(b, k)) {
1872 /*
1873 * We insert a new key to cover the top of the
1874 * old key, and the old key is modified in place
1875 * to represent the bottom split.
1876 *
1877 * It's completely arbitrary whether the new key
1878 * is the top or the bottom, but it has to match
1879 * up with what btree_sort_fixup() does - it
1880 * doesn't check for this kind of overlap, it
1881 * depends on us inserting a new key for the top
1882 * here.
1883 */
1884 top = bch_bset_search(b, &b->sets[b->nsets],
1885 insert);
1886 shift_keys(b, top, k);
1887 } else {
1888 BKEY_PADDED(key) temp;
1889 bkey_copy(&temp.key, k);
1890 shift_keys(b, k, &temp.key);
1891 top = bkey_next(k);
1892 }
1893
1894 bch_cut_front(insert, top);
1895 bch_cut_back(&START_KEY(insert), k);
1896 bch_bset_fix_invalidated_key(b, k);
1897 return false;
1898 }
1899
1900 if (bkey_cmp(insert, k) < 0) {
1901 bch_cut_front(insert, k);
1902 } else {
1903 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1904 old_offset = KEY_START(insert);
1905
1906 if (bkey_written(b, k) &&
1907 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1908 /*
1909 * Completely overwrote, so we don't have to
1910 * invalidate the binary search tree
1911 */
1912 bch_cut_front(k, k);
1913 } else {
1914 __bch_cut_back(&START_KEY(insert), k);
1915 bch_bset_fix_invalidated_key(b, k);
1916 }
1917 }
1918
1919 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1920 }
1921 1793
1922check_failed: 1794 status = bch_btree_insert_key(&b->keys, k, replace_key);
1923 if (replace_key) { 1795 if (status != BTREE_INSERT_STATUS_NO_INSERT) {
1924 if (!sectors_found) { 1796 bch_check_keys(&b->keys, "%u for %s", status,
1925 return true; 1797 replace_key ? "replace" : "insert");
1926 } else if (sectors_found < KEY_SIZE(insert)) {
1927 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1928 (KEY_SIZE(insert) - sectors_found));
1929 SET_KEY_SIZE(insert, sectors_found);
1930 }
1931 }
1932 1798
1933 return false; 1799 trace_bcache_btree_insert_key(b, k, replace_key != NULL,
1800 status);
1801 return true;
1802 } else
1803 return false;
1934} 1804}
1935 1805
1936static bool btree_insert_key(struct btree *b, struct btree_op *op, 1806static size_t insert_u64s_remaining(struct btree *b)
1937 struct bkey *k, struct bkey *replace_key)
1938{ 1807{
1939 struct bset *i = b->sets[b->nsets].data; 1808 ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys);
1940 struct bkey *m, *prev;
1941 unsigned status = BTREE_INSERT_STATUS_INSERT;
1942
1943 BUG_ON(bkey_cmp(k, &b->key) > 0);
1944 BUG_ON(b->level && !KEY_PTRS(k));
1945 BUG_ON(!b->level && !KEY_OFFSET(k));
1946
1947 if (!b->level) {
1948 struct btree_iter iter;
1949
1950 /*
1951 * bset_search() returns the first key that is strictly greater
1952 * than the search key - but for back merging, we want to find
1953 * the previous key.
1954 */
1955 prev = NULL;
1956 m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
1957 1809
1958 if (fix_overlapping_extents(b, k, &iter, replace_key)) { 1810 /*
1959 op->insert_collision = true; 1811 * Might land in the middle of an existing extent and have to split it
1960 return false; 1812 */
1961 } 1813 if (b->keys.ops->is_extents)
1962 1814 ret -= KEY_MAX_U64S;
1963 if (KEY_DIRTY(k))
1964 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1965 KEY_START(k), KEY_SIZE(k));
1966
1967 while (m != end(i) &&
1968 bkey_cmp(k, &START_KEY(m)) > 0)
1969 prev = m, m = bkey_next(m);
1970
1971 if (key_merging_disabled(b->c))
1972 goto insert;
1973
1974 /* prev is in the tree, if we merge we're done */
1975 status = BTREE_INSERT_STATUS_BACK_MERGE;
1976 if (prev &&
1977 bch_bkey_try_merge(b, prev, k))
1978 goto merged;
1979
1980 status = BTREE_INSERT_STATUS_OVERWROTE;
1981 if (m != end(i) &&
1982 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1983 goto copy;
1984
1985 status = BTREE_INSERT_STATUS_FRONT_MERGE;
1986 if (m != end(i) &&
1987 bch_bkey_try_merge(b, k, m))
1988 goto copy;
1989 } else {
1990 BUG_ON(replace_key);
1991 m = bch_bset_search(b, &b->sets[b->nsets], k);
1992 }
1993
1994insert: shift_keys(b, m, k);
1995copy: bkey_copy(m, k);
1996merged:
1997 bch_check_keys(b, "%u for %s", status,
1998 replace_key ? "replace" : "insert");
1999
2000 if (b->level && !KEY_OFFSET(k))
2001 btree_current_write(b)->prio_blocked++;
2002
2003 trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
2004 1815
2005 return true; 1816 return max(ret, 0L);
2006} 1817}
2007 1818
2008static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, 1819static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
@@ -2010,21 +1821,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2010 struct bkey *replace_key) 1821 struct bkey *replace_key)
2011{ 1822{
2012 bool ret = false; 1823 bool ret = false;
2013 int oldsize = bch_count_data(b); 1824 int oldsize = bch_count_data(&b->keys);
2014 1825
2015 while (!bch_keylist_empty(insert_keys)) { 1826 while (!bch_keylist_empty(insert_keys)) {
2016 struct bset *i = write_block(b);
2017 struct bkey *k = insert_keys->keys; 1827 struct bkey *k = insert_keys->keys;
2018 1828
2019 if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c) 1829 if (bkey_u64s(k) > insert_u64s_remaining(b))
2020 > btree_blocks(b))
2021 break; 1830 break;
2022 1831
2023 if (bkey_cmp(k, &b->key) <= 0) { 1832 if (bkey_cmp(k, &b->key) <= 0) {
2024 if (!b->level) 1833 if (!b->level)
2025 bkey_put(b->c, k); 1834 bkey_put(b->c, k);
2026 1835
2027 ret |= btree_insert_key(b, op, k, replace_key); 1836 ret |= btree_insert_key(b, k, replace_key);
2028 bch_keylist_pop_front(insert_keys); 1837 bch_keylist_pop_front(insert_keys);
2029 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { 1838 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2030 BKEY_PADDED(key) temp; 1839 BKEY_PADDED(key) temp;
@@ -2033,16 +1842,19 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2033 bch_cut_back(&b->key, &temp.key); 1842 bch_cut_back(&b->key, &temp.key);
2034 bch_cut_front(&b->key, insert_keys->keys); 1843 bch_cut_front(&b->key, insert_keys->keys);
2035 1844
2036 ret |= btree_insert_key(b, op, &temp.key, replace_key); 1845 ret |= btree_insert_key(b, &temp.key, replace_key);
2037 break; 1846 break;
2038 } else { 1847 } else {
2039 break; 1848 break;
2040 } 1849 }
2041 } 1850 }
2042 1851
1852 if (!ret)
1853 op->insert_collision = true;
1854
2043 BUG_ON(!bch_keylist_empty(insert_keys) && b->level); 1855 BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2044 1856
2045 BUG_ON(bch_count_data(b) < oldsize); 1857 BUG_ON(bch_count_data(&b->keys) < oldsize);
2046 return ret; 1858 return ret;
2047} 1859}
2048 1860
@@ -2059,16 +1871,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
2059 closure_init_stack(&cl); 1871 closure_init_stack(&cl);
2060 bch_keylist_init(&parent_keys); 1872 bch_keylist_init(&parent_keys);
2061 1873
1874 if (!b->level &&
1875 btree_check_reserve(b, op))
1876 return -EINTR;
1877
2062 n1 = btree_node_alloc_replacement(b, true); 1878 n1 = btree_node_alloc_replacement(b, true);
2063 if (IS_ERR(n1)) 1879 if (IS_ERR(n1))
2064 goto err; 1880 goto err;
2065 1881
2066 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1882 split = set_blocks(btree_bset_first(n1),
1883 block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
2067 1884
2068 if (split) { 1885 if (split) {
2069 unsigned keys = 0; 1886 unsigned keys = 0;
2070 1887
2071 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 1888 trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
2072 1889
2073 n2 = bch_btree_node_alloc(b->c, b->level, true); 1890 n2 = bch_btree_node_alloc(b->c, b->level, true);
2074 if (IS_ERR(n2)) 1891 if (IS_ERR(n2))
@@ -2087,18 +1904,20 @@ static int btree_split(struct btree *b, struct btree_op *op,
2087 * search tree yet 1904 * search tree yet
2088 */ 1905 */
2089 1906
2090 while (keys < (n1->sets[0].data->keys * 3) / 5) 1907 while (keys < (btree_bset_first(n1)->keys * 3) / 5)
2091 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1908 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
1909 keys));
2092 1910
2093 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); 1911 bkey_copy_key(&n1->key,
2094 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1912 bset_bkey_idx(btree_bset_first(n1), keys));
1913 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
2095 1914
2096 n2->sets[0].data->keys = n1->sets[0].data->keys - keys; 1915 btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
2097 n1->sets[0].data->keys = keys; 1916 btree_bset_first(n1)->keys = keys;
2098 1917
2099 memcpy(n2->sets[0].data->start, 1918 memcpy(btree_bset_first(n2)->start,
2100 end(n1->sets[0].data), 1919 bset_bkey_last(btree_bset_first(n1)),
2101 n2->sets[0].data->keys * sizeof(uint64_t)); 1920 btree_bset_first(n2)->keys * sizeof(uint64_t));
2102 1921
2103 bkey_copy_key(&n2->key, &b->key); 1922 bkey_copy_key(&n2->key, &b->key);
2104 1923
@@ -2106,7 +1925,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
2106 bch_btree_node_write(n2, &cl); 1925 bch_btree_node_write(n2, &cl);
2107 rw_unlock(true, n2); 1926 rw_unlock(true, n2);
2108 } else { 1927 } else {
2109 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 1928 trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
2110 1929
2111 bch_btree_insert_keys(n1, op, insert_keys, replace_key); 1930 bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2112 } 1931 }
@@ -2149,18 +1968,21 @@ static int btree_split(struct btree *b, struct btree_op *op,
2149 1968
2150 return 0; 1969 return 0;
2151err_free2: 1970err_free2:
1971 bkey_put(b->c, &n2->key);
2152 btree_node_free(n2); 1972 btree_node_free(n2);
2153 rw_unlock(true, n2); 1973 rw_unlock(true, n2);
2154err_free1: 1974err_free1:
1975 bkey_put(b->c, &n1->key);
2155 btree_node_free(n1); 1976 btree_node_free(n1);
2156 rw_unlock(true, n1); 1977 rw_unlock(true, n1);
2157err: 1978err:
1979 WARN(1, "bcache: btree split failed");
1980
2158 if (n3 == ERR_PTR(-EAGAIN) || 1981 if (n3 == ERR_PTR(-EAGAIN) ||
2159 n2 == ERR_PTR(-EAGAIN) || 1982 n2 == ERR_PTR(-EAGAIN) ||
2160 n1 == ERR_PTR(-EAGAIN)) 1983 n1 == ERR_PTR(-EAGAIN))
2161 return -EAGAIN; 1984 return -EAGAIN;
2162 1985
2163 pr_warn("couldn't split");
2164 return -ENOMEM; 1986 return -ENOMEM;
2165} 1987}
2166 1988
@@ -2171,7 +1993,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2171{ 1993{
2172 BUG_ON(b->level && replace_key); 1994 BUG_ON(b->level && replace_key);
2173 1995
2174 if (should_split(b)) { 1996 if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
2175 if (current->bio_list) { 1997 if (current->bio_list) {
2176 op->lock = b->c->root->level + 1; 1998 op->lock = b->c->root->level + 1;
2177 return -EAGAIN; 1999 return -EAGAIN;
@@ -2180,11 +2002,13 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2180 return -EINTR; 2002 return -EINTR;
2181 } else { 2003 } else {
2182 /* Invalidated all iterators */ 2004 /* Invalidated all iterators */
2183 return btree_split(b, op, insert_keys, replace_key) ?: 2005 int ret = btree_split(b, op, insert_keys, replace_key);
2184 -EINTR; 2006
2007 return bch_keylist_empty(insert_keys) ?
2008 0 : ret ?: -EINTR;
2185 } 2009 }
2186 } else { 2010 } else {
2187 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2011 BUG_ON(write_block(b) != btree_bset_last(b));
2188 2012
2189 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 2013 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2190 if (!b->level) 2014 if (!b->level)
@@ -2323,9 +2147,9 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2323 struct bkey *k; 2147 struct bkey *k;
2324 struct btree_iter iter; 2148 struct btree_iter iter;
2325 2149
2326 bch_btree_iter_init(b, &iter, from); 2150 bch_btree_iter_init(&b->keys, &iter, from);
2327 2151
2328 while ((k = bch_btree_iter_next_filter(&iter, b, 2152 while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
2329 bch_ptr_bad))) { 2153 bch_ptr_bad))) {
2330 ret = btree(map_nodes_recurse, k, b, 2154 ret = btree(map_nodes_recurse, k, b,
2331 op, from, fn, flags); 2155 op, from, fn, flags);
@@ -2356,9 +2180,9 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2356 struct bkey *k; 2180 struct bkey *k;
2357 struct btree_iter iter; 2181 struct btree_iter iter;
2358 2182
2359 bch_btree_iter_init(b, &iter, from); 2183 bch_btree_iter_init(&b->keys, &iter, from);
2360 2184
2361 while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { 2185 while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
2362 ret = !b->level 2186 ret = !b->level
2363 ? fn(op, b, k) 2187 ? fn(op, b, k)
2364 : btree(map_keys_recurse, k, b, op, from, fn, flags); 2188 : btree(map_keys_recurse, k, b, op, from, fn, flags);
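
The btree.c hunks above replace the open-coded extent overlap handling with bch_btree_insert_key() operating on the embedded btree_keys, and gate both inserts and splits on insert_u64s_remaining(), which keeps KEY_MAX_U64S of headroom on extent nodes because an insert can land in the middle of an existing extent and force a split. Below is a minimal standalone sketch of that headroom check; every name and constant here is a toy stand-in, not the kernel's.

#include <stdbool.h>
#include <stddef.h>

#define TOY_KEY_MAX_U64S 8 /* assumed worst-case key size, in u64s */

struct toy_node {
	size_t capacity_u64s; /* space left in the unwritten bset */
	size_t used_u64s;
	bool is_extents;      /* leaf node holding extents? */
};

static size_t toy_insert_u64s_remaining(const struct toy_node *b)
{
	long ret = (long)b->capacity_u64s - (long)b->used_u64s;

	/* an insert may land mid-extent and split it, costing one more key */
	if (b->is_extents)
		ret -= TOY_KEY_MAX_U64S;

	return ret > 0 ? (size_t)ret : 0;
}

/* mirrors the check bch_btree_insert_node() now makes before splitting */
static bool toy_needs_split(const struct toy_node *b, size_t insert_u64s)
{
	return insert_u64s > toy_insert_u64s_remaining(b);
}
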
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 767e75570896..af065e97e55c 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -130,20 +130,12 @@ struct btree {
130 unsigned long flags; 130 unsigned long flags;
131 uint16_t written; /* would be nice to kill */ 131 uint16_t written; /* would be nice to kill */
132 uint8_t level; 132 uint8_t level;
133 uint8_t nsets; 133
134 uint8_t page_order; 134 struct btree_keys keys;
135
136 /*
137 * Set of sorted keys - the real btree node - plus a binary search tree
138 *
139 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
140 * to the memory we have allocated for this btree node. Additionally,
141 * set[0]->data points to the entire btree node as it exists on disk.
142 */
143 struct bset_tree sets[MAX_BSETS];
144 135
145 /* For outstanding btree writes, used as a lock - protects write_idx */ 136 /* For outstanding btree writes, used as a lock - protects write_idx */
146 struct closure_with_waitlist io; 137 struct closure io;
138 struct semaphore io_mutex;
147 139
148 struct list_head list; 140 struct list_head list;
149 struct delayed_work work; 141 struct delayed_work work;
@@ -179,24 +171,19 @@ static inline struct btree_write *btree_prev_write(struct btree *b)
179 return b->writes + (btree_node_write_idx(b) ^ 1); 171 return b->writes + (btree_node_write_idx(b) ^ 1);
180} 172}
181 173
182static inline unsigned bset_offset(struct btree *b, struct bset *i) 174static inline struct bset *btree_bset_first(struct btree *b)
183{ 175{
184 return (((size_t) i) - ((size_t) b->sets->data)) >> 9; 176 return b->keys.set->data;
185} 177}
186 178
187static inline struct bset *write_block(struct btree *b) 179static inline struct bset *btree_bset_last(struct btree *b)
188{ 180{
189 return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); 181 return bset_tree_last(&b->keys)->data;
190} 182}
191 183
192static inline bool bset_written(struct btree *b, struct bset_tree *t) 184static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
193{ 185{
194 return t->data < write_block(b); 186 return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
195}
196
197static inline bool bkey_written(struct btree *b, struct bkey *k)
198{
199 return k < write_block(b)->start;
200} 187}
201 188
202static inline void set_gc_sectors(struct cache_set *c) 189static inline void set_gc_sectors(struct cache_set *c)
@@ -204,21 +191,6 @@ static inline void set_gc_sectors(struct cache_set *c)
204 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); 191 atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
205} 192}
206 193
207static inline struct bkey *bch_btree_iter_init(struct btree *b,
208 struct btree_iter *iter,
209 struct bkey *search)
210{
211 return __bch_btree_iter_init(b, iter, search, b->sets);
212}
213
214static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
215{
216 if (b->level)
217 return bch_btree_ptr_invalid(b->c, k);
218 else
219 return bch_extent_ptr_invalid(b->c, k);
220}
221
222void bkey_put(struct cache_set *c, struct bkey *k); 194void bkey_put(struct cache_set *c, struct bkey *k);
223 195
224/* Looping macros */ 196/* Looping macros */
@@ -229,17 +201,12 @@ void bkey_put(struct cache_set *c, struct bkey *k);
229 iter++) \ 201 iter++) \
230 hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) 202 hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
231 203
232#define for_each_key_filter(b, k, iter, filter) \
233 for (bch_btree_iter_init((b), (iter), NULL); \
234 ((k) = bch_btree_iter_next_filter((iter), b, filter));)
235
236#define for_each_key(b, k, iter) \
237 for (bch_btree_iter_init((b), (iter), NULL); \
238 ((k) = bch_btree_iter_next(iter));)
239
240/* Recursing down the btree */ 204/* Recursing down the btree */
241 205
242struct btree_op { 206struct btree_op {
207 /* for waiting on btree reserve in btree_split() */
208 wait_queue_t wait;
209
243 /* Btree level at which we start taking write locks */ 210 /* Btree level at which we start taking write locks */
244 short lock; 211 short lock;
245 212
@@ -249,6 +216,7 @@ struct btree_op {
249static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) 216static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
250{ 217{
251 memset(op, 0, sizeof(struct btree_op)); 218 memset(op, 0, sizeof(struct btree_op));
219 init_wait(&op->wait);
252 op->lock = write_lock_level; 220 op->lock = write_lock_level;
253} 221}
254 222
@@ -267,7 +235,7 @@ static inline void rw_unlock(bool w, struct btree *b)
267 (w ? up_write : up_read)(&b->lock); 235 (w ? up_write : up_read)(&b->lock);
268} 236}
269 237
270void bch_btree_node_read(struct btree *); 238void bch_btree_node_read_done(struct btree *);
271void bch_btree_node_write(struct btree *, struct closure *); 239void bch_btree_node_write(struct btree *, struct closure *);
272 240
273void bch_btree_set_root(struct btree *); 241void bch_btree_set_root(struct btree *);
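
With struct btree now embedding struct btree_keys, code that used to index b->sets[] reaches the bsets through the new accessors instead. A short sketch of the intended usage, assuming only the helpers declared in this header; the two wrapper functions are hypothetical.

/* first bset: the node as it exists on disk */
static unsigned node_ondisk_keys(struct btree *b)
{
	return btree_bset_first(b)->keys;
}

/* last bset: the in-memory set still being appended to */
static unsigned node_unwritten_keys(struct btree *b)
{
	return btree_bset_last(b)->keys;
}
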
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index dfff2410322e..7a228de95fd7 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -11,19 +11,6 @@
11 11
12#include "closure.h" 12#include "closure.h"
13 13
14#define CL_FIELD(type, field) \
15 case TYPE_ ## type: \
16 return &container_of(cl, struct type, cl)->field
17
18static struct closure_waitlist *closure_waitlist(struct closure *cl)
19{
20 switch (cl->type) {
21 CL_FIELD(closure_with_waitlist, wait);
22 default:
23 return NULL;
24 }
25}
26
27static inline void closure_put_after_sub(struct closure *cl, int flags) 14static inline void closure_put_after_sub(struct closure *cl, int flags)
28{ 15{
29 int r = flags & CLOSURE_REMAINING_MASK; 16 int r = flags & CLOSURE_REMAINING_MASK;
@@ -42,17 +29,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
42 closure_queue(cl); 29 closure_queue(cl);
43 } else { 30 } else {
44 struct closure *parent = cl->parent; 31 struct closure *parent = cl->parent;
45 struct closure_waitlist *wait = closure_waitlist(cl);
46 closure_fn *destructor = cl->fn; 32 closure_fn *destructor = cl->fn;
47 33
48 closure_debug_destroy(cl); 34 closure_debug_destroy(cl);
49 35
50 smp_mb();
51 atomic_set(&cl->remaining, -1);
52
53 if (wait)
54 closure_wake_up(wait);
55
56 if (destructor) 36 if (destructor)
57 destructor(cl); 37 destructor(cl);
58 38
@@ -69,19 +49,18 @@ void closure_sub(struct closure *cl, int v)
69} 49}
70EXPORT_SYMBOL(closure_sub); 50EXPORT_SYMBOL(closure_sub);
71 51
52/**
53 * closure_put - decrement a closure's refcount
54 */
72void closure_put(struct closure *cl) 55void closure_put(struct closure *cl)
73{ 56{
74 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); 57 closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
75} 58}
76EXPORT_SYMBOL(closure_put); 59EXPORT_SYMBOL(closure_put);
77 60
78static void set_waiting(struct closure *cl, unsigned long f) 61/**
79{ 62 * __closure_wake_up - wake up all closures on a wait list, without memory barrier
80#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 63 */
81 cl->waiting_on = f;
82#endif
83}
84
85void __closure_wake_up(struct closure_waitlist *wait_list) 64void __closure_wake_up(struct closure_waitlist *wait_list)
86{ 65{
87 struct llist_node *list; 66 struct llist_node *list;
@@ -106,27 +85,34 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
106 cl = container_of(reverse, struct closure, list); 85 cl = container_of(reverse, struct closure, list);
107 reverse = llist_next(reverse); 86 reverse = llist_next(reverse);
108 87
109 set_waiting(cl, 0); 88 closure_set_waiting(cl, 0);
110 closure_sub(cl, CLOSURE_WAITING + 1); 89 closure_sub(cl, CLOSURE_WAITING + 1);
111 } 90 }
112} 91}
113EXPORT_SYMBOL(__closure_wake_up); 92EXPORT_SYMBOL(__closure_wake_up);
114 93
115bool closure_wait(struct closure_waitlist *list, struct closure *cl) 94/**
95 * closure_wait - add a closure to a waitlist
96 *
97 * @waitlist will own a ref on @cl, which will be released when
98 * closure_wake_up() is called on @waitlist.
99 *
100 */
101bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
116{ 102{
117 if (atomic_read(&cl->remaining) & CLOSURE_WAITING) 103 if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
118 return false; 104 return false;
119 105
120 set_waiting(cl, _RET_IP_); 106 closure_set_waiting(cl, _RET_IP_);
121 atomic_add(CLOSURE_WAITING + 1, &cl->remaining); 107 atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
122 llist_add(&cl->list, &list->list); 108 llist_add(&cl->list, &waitlist->list);
123 109
124 return true; 110 return true;
125} 111}
126EXPORT_SYMBOL(closure_wait); 112EXPORT_SYMBOL(closure_wait);
127 113
128/** 114/**
129 * closure_sync() - sleep until a closure has nothing left to wait on 115 * closure_sync - sleep until a closure has nothing left to wait on
130 * 116 *
131 * Sleeps until the refcount hits 1 - the thread that's running the closure owns 117 * Sleeps until the refcount hits 1 - the thread that's running the closure owns
132 * the last refcount. 118 * the last refcount.
@@ -148,46 +134,6 @@ void closure_sync(struct closure *cl)
148} 134}
149EXPORT_SYMBOL(closure_sync); 135EXPORT_SYMBOL(closure_sync);
150 136
151/**
152 * closure_trylock() - try to acquire the closure, without waiting
153 * @cl: closure to lock
154 *
155 * Returns true if the closure was successfully locked.
156 */
157bool closure_trylock(struct closure *cl, struct closure *parent)
158{
159 if (atomic_cmpxchg(&cl->remaining, -1,
160 CLOSURE_REMAINING_INITIALIZER) != -1)
161 return false;
162
163 smp_mb();
164
165 cl->parent = parent;
166 if (parent)
167 closure_get(parent);
168
169 closure_set_ret_ip(cl);
170 closure_debug_create(cl);
171 return true;
172}
173EXPORT_SYMBOL(closure_trylock);
174
175void __closure_lock(struct closure *cl, struct closure *parent,
176 struct closure_waitlist *wait_list)
177{
178 struct closure wait;
179 closure_init_stack(&wait);
180
181 while (1) {
182 if (closure_trylock(cl, parent))
183 return;
184
185 closure_wait_event(wait_list, &wait,
186 atomic_read(&cl->remaining) == -1);
187 }
188}
189EXPORT_SYMBOL(__closure_lock);
190
191#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 137#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
192 138
193static LIST_HEAD(closure_list); 139static LIST_HEAD(closure_list);
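
closure_wait() parks a closure on a waitlist and takes a CLOSURE_WAITING ref that __closure_wake_up() later drops, so a waiter can simply sync on its own closure until someone wakes the list. A hedged sketch of that pattern follows; struct pending_io and both functions are hypothetical callers, and only the closure_* calls come from this file and closure.h.

struct pending_io {
	struct closure_waitlist wait; /* woken when the resource frees up */
};

/* @cl is assumed to have been set up with closure_init_stack() */
static void park_until_woken(struct pending_io *p, struct closure *cl)
{
	/*
	 * closure_wait() returns false if @cl is already on a waitlist;
	 * otherwise it takes a ref on @cl that closure_wake_up() drops.
	 */
	if (closure_wait(&p->wait, cl))
		closure_sync(cl); /* sleep until that ref is dropped */
}

static void resource_freed(struct pending_io *p)
{
	closure_wake_up(&p->wait); /* wakes every closure parked above */
}
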
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 9762f1be3304..7ef7461912be 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -72,30 +72,6 @@
72 * closure - _always_ use continue_at(). Doing so consistently will help 72 * closure - _always_ use continue_at(). Doing so consistently will help
73 * eliminate an entire class of particularly pernicious races. 73 * eliminate an entire class of particularly pernicious races.
74 * 74 *
75 * For a closure to wait on an arbitrary event, we need to introduce waitlists:
76 *
77 * struct closure_waitlist list;
78 * closure_wait_event(list, cl, condition);
79 * closure_wake_up(wait_list);
80 *
81 * These work analogously to wait_event() and wake_up() - except that instead of
82 * operating on the current thread (for wait_event()) and lists of threads, they
83 * operate on an explicit closure and lists of closures.
84 *
85 * Because it's a closure we can now wait either synchronously or
86 * asynchronously. closure_wait_event() returns the current value of the
87 * condition, and if it returned false continue_at() or closure_sync() can be
88 * used to wait for it to become true.
89 *
90 * It's useful for waiting on things when you can't sleep in the context in
91 * which you must check the condition (perhaps a spinlock held, or you might be
92 * beneath generic_make_request() - in which case you can't sleep on IO).
93 *
94 * closure_wait_event() will wait either synchronously or asynchronously,
95 * depending on whether the closure is in blocking mode or not. You can pick a
96 * mode explicitly with closure_wait_event_sync() and
97 * closure_wait_event_async(), which do just what you might expect.
98 *
99 * Lastly, you might have a wait list dedicated to a specific event, and have no 75 * Lastly, you might have a wait list dedicated to a specific event, and have no
100 * need for specifying the condition - you just want to wait until someone runs 76 * need for specifying the condition - you just want to wait until someone runs
101 * closure_wake_up() on the appropriate wait list. In that case, just use 77 * closure_wake_up() on the appropriate wait list. In that case, just use
@@ -121,40 +97,6 @@
121 * All this implies that a closure should typically be embedded in a particular 97 * All this implies that a closure should typically be embedded in a particular
122 * struct (which its refcount will normally control the lifetime of), and that 98 * struct (which its refcount will normally control the lifetime of), and that
123 * struct can very much be thought of as a stack frame. 99 * struct can very much be thought of as a stack frame.
124 *
125 * Locking:
126 *
127 * Closures are based on work items but they can be thought of as more like
128 * threads - in that like threads and unlike work items they have a well
129 * defined lifetime; they are created (with closure_init()) and eventually
130 * complete after a continue_at(cl, NULL, NULL).
131 *
132 * Suppose you've got some larger structure with a closure embedded in it that's
133 * used for periodically doing garbage collection. You only want one garbage
134 * collection happening at a time, so the natural thing to do is protect it with
135 * a lock. However, it's difficult to use a lock protecting a closure correctly
136 * because the unlock should come after the last continue_at() (additionally, if
137 * you're using the closure asynchronously a mutex won't work since a mutex has
138 * to be unlocked by the same process that locked it).
139 *
140 * So to make it less error prone and more efficient, we also have the ability
141 * to use closures as locks:
142 *
143 * closure_init_unlocked();
144 * closure_trylock();
145 *
146 * That's all we need for trylock() - the last closure_put() implicitly unlocks
147 * it for you. But for closure_lock(), we also need a wait list:
148 *
149 * struct closure_with_waitlist frobnicator_cl;
150 *
151 * closure_init_unlocked(&frobnicator_cl);
152 * closure_lock(&frobnicator_cl);
153 *
154 * A closure_with_waitlist embeds a closure and a wait list - much like struct
155 * delayed_work embeds a work item and a timer_list. The important thing is, use
156 * it exactly like you would a regular closure and closure_put() will magically
157 * handle everything for you.
158 */ 100 */
159 101
160struct closure; 102struct closure;
@@ -164,12 +106,6 @@ struct closure_waitlist {
164 struct llist_head list; 106 struct llist_head list;
165}; 107};
166 108
167enum closure_type {
168 TYPE_closure = 0,
169 TYPE_closure_with_waitlist = 1,
170 MAX_CLOSURE_TYPE = 1,
171};
172
173enum closure_state { 109enum closure_state {
174 /* 110 /*
175 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by 111 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
@@ -224,8 +160,6 @@ struct closure {
224 160
225 atomic_t remaining; 161 atomic_t remaining;
226 162
227 enum closure_type type;
228
229#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 163#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
230#define CLOSURE_MAGIC_DEAD 0xc054dead 164#define CLOSURE_MAGIC_DEAD 0xc054dead
231#define CLOSURE_MAGIC_ALIVE 0xc054a11e 165#define CLOSURE_MAGIC_ALIVE 0xc054a11e
@@ -237,34 +171,12 @@ struct closure {
237#endif 171#endif
238}; 172};
239 173
240struct closure_with_waitlist {
241 struct closure cl;
242 struct closure_waitlist wait;
243};
244
245extern unsigned invalid_closure_type(void);
246
247#define __CLOSURE_TYPE(cl, _t) \
248 __builtin_types_compatible_p(typeof(cl), struct _t) \
249 ? TYPE_ ## _t : \
250
251#define __closure_type(cl) \
252( \
253 __CLOSURE_TYPE(cl, closure) \
254 __CLOSURE_TYPE(cl, closure_with_waitlist) \
255 invalid_closure_type() \
256)
257
258void closure_sub(struct closure *cl, int v); 174void closure_sub(struct closure *cl, int v);
259void closure_put(struct closure *cl); 175void closure_put(struct closure *cl);
260void __closure_wake_up(struct closure_waitlist *list); 176void __closure_wake_up(struct closure_waitlist *list);
261bool closure_wait(struct closure_waitlist *list, struct closure *cl); 177bool closure_wait(struct closure_waitlist *list, struct closure *cl);
262void closure_sync(struct closure *cl); 178void closure_sync(struct closure *cl);
263 179
264bool closure_trylock(struct closure *cl, struct closure *parent);
265void __closure_lock(struct closure *cl, struct closure *parent,
266 struct closure_waitlist *wait_list);
267
268#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 180#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
269 181
270void closure_debug_init(void); 182void closure_debug_init(void);
@@ -293,134 +205,97 @@ static inline void closure_set_ret_ip(struct closure *cl)
293#endif 205#endif
294} 206}
295 207
296static inline void closure_get(struct closure *cl) 208static inline void closure_set_waiting(struct closure *cl, unsigned long f)
297{ 209{
298#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 210#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
299 BUG_ON((atomic_inc_return(&cl->remaining) & 211 cl->waiting_on = f;
300 CLOSURE_REMAINING_MASK) <= 1);
301#else
302 atomic_inc(&cl->remaining);
303#endif 212#endif
304} 213}
305 214
306static inline void closure_set_stopped(struct closure *cl) 215static inline void __closure_end_sleep(struct closure *cl)
307{ 216{
308 atomic_sub(CLOSURE_RUNNING, &cl->remaining); 217 __set_current_state(TASK_RUNNING);
218
219 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
220 atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
309} 221}
310 222
311static inline bool closure_is_unlocked(struct closure *cl) 223static inline void __closure_start_sleep(struct closure *cl)
312{ 224{
313 return atomic_read(&cl->remaining) == -1; 225 closure_set_ip(cl);
226 cl->task = current;
227 set_current_state(TASK_UNINTERRUPTIBLE);
228
229 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
230 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
314} 231}
315 232
316static inline void do_closure_init(struct closure *cl, struct closure *parent, 233static inline void closure_set_stopped(struct closure *cl)
317 bool running)
318{ 234{
319 cl->parent = parent; 235 atomic_sub(CLOSURE_RUNNING, &cl->remaining);
320 if (parent) 236}
321 closure_get(parent);
322
323 if (running) {
324 closure_debug_create(cl);
325 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
326 } else
327 atomic_set(&cl->remaining, -1);
328 237
238static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
239 struct workqueue_struct *wq)
240{
241 BUG_ON(object_is_on_stack(cl));
329 closure_set_ip(cl); 242 closure_set_ip(cl);
243 cl->fn = fn;
244 cl->wq = wq;
245 /* between atomic_dec() in closure_put() */
246 smp_mb__before_atomic_dec();
330} 247}
331 248
332/* 249static inline void closure_queue(struct closure *cl)
333 * Hack to get at the embedded closure if there is one, by doing an unsafe cast: 250{
334 * the result of __closure_type() is thrown away, it's used merely for type 251 struct workqueue_struct *wq = cl->wq;
335 * checking. 252 if (wq) {
336 */ 253 INIT_WORK(&cl->work, cl->work.func);
337#define __to_internal_closure(cl) \ 254 BUG_ON(!queue_work(wq, &cl->work));
338({ \ 255 } else
339 BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ 256 cl->fn(cl);
340 (struct closure *) cl; \ 257}
341})
342
343#define closure_init_type(cl, parent, running) \
344do { \
345 struct closure *_cl = __to_internal_closure(cl); \
346 _cl->type = __closure_type(*(cl)); \
347 do_closure_init(_cl, parent, running); \
348} while (0)
349 258
350/** 259/**
351 * __closure_init() - Initialize a closure, skipping the memset() 260 * closure_get - increment a closure's refcount
352 *
353 * May be used instead of closure_init() when memory has already been zeroed.
354 */ 261 */
355#define __closure_init(cl, parent) \ 262static inline void closure_get(struct closure *cl)
356 closure_init_type(cl, parent, true) 263{
264#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
265 BUG_ON((atomic_inc_return(&cl->remaining) &
266 CLOSURE_REMAINING_MASK) <= 1);
267#else
268 atomic_inc(&cl->remaining);
269#endif
270}
357 271
358/** 272/**
359 * closure_init() - Initialize a closure, setting the refcount to 1 273 * closure_init - Initialize a closure, setting the refcount to 1
360 * @cl: closure to initialize 274 * @cl: closure to initialize
361 * @parent: parent of the new closure. cl will take a refcount on it for its 275 * @parent: parent of the new closure. cl will take a refcount on it for its
362 * lifetime; may be NULL. 276 * lifetime; may be NULL.
363 */ 277 */
364#define closure_init(cl, parent) \ 278static inline void closure_init(struct closure *cl, struct closure *parent)
365do { \
366 memset((cl), 0, sizeof(*(cl))); \
367 __closure_init(cl, parent); \
368} while (0)
369
370static inline void closure_init_stack(struct closure *cl)
371{ 279{
372 memset(cl, 0, sizeof(struct closure)); 280 memset(cl, 0, sizeof(struct closure));
373 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); 281 cl->parent = parent;
374} 282 if (parent)
375 283 closure_get(parent);
376/**
377 * closure_init_unlocked() - Initialize a closure but leave it unlocked.
378 * @cl: closure to initialize
379 *
380 * For when the closure will be used as a lock. The closure may not be used
381 * until after a closure_lock() or closure_trylock().
382 */
383#define closure_init_unlocked(cl) \
384do { \
385 memset((cl), 0, sizeof(*(cl))); \
386 closure_init_type(cl, NULL, false); \
387} while (0)
388
389/**
390 * closure_lock() - lock and initialize a closure.
391 * @cl: the closure to lock
392 * @parent: the new parent for this closure
393 *
394 * The closure must be of one of the types that has a waitlist (otherwise we
395 * wouldn't be able to sleep on contention).
396 *
397 * @parent has exactly the same meaning as in closure_init(); if non null, the
398 * closure will take a reference on @parent which will be released when it is
399 * unlocked.
400 */
401#define closure_lock(cl, parent) \
402 __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait)
403 284
404static inline void __closure_end_sleep(struct closure *cl) 285 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
405{
406 __set_current_state(TASK_RUNNING);
407 286
408 if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) 287 closure_debug_create(cl);
409 atomic_sub(CLOSURE_SLEEPING, &cl->remaining); 288 closure_set_ip(cl);
410} 289}
411 290
412static inline void __closure_start_sleep(struct closure *cl) 291static inline void closure_init_stack(struct closure *cl)
413{ 292{
414 closure_set_ip(cl); 293 memset(cl, 0, sizeof(struct closure));
415 cl->task = current; 294 atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
416 set_current_state(TASK_UNINTERRUPTIBLE);
417
418 if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
419 atomic_add(CLOSURE_SLEEPING, &cl->remaining);
420} 295}
421 296
422/** 297/**
423 * closure_wake_up() - wake up all closures on a wait list. 298 * closure_wake_up - wake up all closures on a wait list.
424 */ 299 */
425static inline void closure_wake_up(struct closure_waitlist *list) 300static inline void closure_wake_up(struct closure_waitlist *list)
426{ 301{
@@ -428,69 +303,19 @@ static inline void closure_wake_up(struct closure_waitlist *list)
428 __closure_wake_up(list); 303 __closure_wake_up(list);
429} 304}
430 305
431/* 306/**
432 * Wait on an event, synchronously or asynchronously - analogous to wait_event() 307 * continue_at - jump to another function with barrier
433 * but for closures. 308 *
434 * 309 * After @cl is no longer waiting on anything (i.e. all outstanding refs have
435 * The loop is oddly structured so as to avoid a race; we must check the 310 * been dropped with closure_put()), it will resume execution at @fn running out
436 * condition again after we've added ourself to the waitlist. We know if we were 311 * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
437 * already on the waitlist because closure_wait() returns false; thus, we only 312 *
438 * schedule or break if closure_wait() returns false. If it returns true, we 313 * NOTE: This macro expands to a return in the calling function!
439 * just loop again - rechecking the condition. 314 *
440 * 315 * This is because after calling continue_at() you no longer have a ref on @cl,
441 * The __closure_wake_up() is necessary because we may race with the event 316 * and whatever @cl owns may be freed out from under you - a running closure fn
442 * becoming true; i.e. we see event false -> wait -> recheck condition, but the 317 * has a ref on its own closure which continue_at() drops.
443 * thread that made the event true may have called closure_wake_up() before we
444 * added ourself to the wait list.
445 *
446 * We have to call closure_sync() at the end instead of just
447 * __closure_end_sleep() because a different thread might've called
448 * closure_wake_up() before us and gotten preempted before they dropped the
449 * refcount on our closure. If this was a stack allocated closure, that would be
450 * bad.
451 */ 318 */
452#define closure_wait_event(list, cl, condition) \
453({ \
454 typeof(condition) ret; \
455 \
456 while (1) { \
457 ret = (condition); \
458 if (ret) { \
459 __closure_wake_up(list); \
460 closure_sync(cl); \
461 break; \
462 } \
463 \
464 __closure_start_sleep(cl); \
465 \
466 if (!closure_wait(list, cl)) \
467 schedule(); \
468 } \
469 \
470 ret; \
471})
472
473static inline void closure_queue(struct closure *cl)
474{
475 struct workqueue_struct *wq = cl->wq;
476 if (wq) {
477 INIT_WORK(&cl->work, cl->work.func);
478 BUG_ON(!queue_work(wq, &cl->work));
479 } else
480 cl->fn(cl);
481}
482
483static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
484 struct workqueue_struct *wq)
485{
486 BUG_ON(object_is_on_stack(cl));
487 closure_set_ip(cl);
488 cl->fn = fn;
489 cl->wq = wq;
490 /* between atomic_dec() in closure_put() */
491 smp_mb__before_atomic_dec();
492}
493
494#define continue_at(_cl, _fn, _wq) \ 319#define continue_at(_cl, _fn, _wq) \
495do { \ 320do { \
496 set_closure_fn(_cl, _fn, _wq); \ 321 set_closure_fn(_cl, _fn, _wq); \
@@ -498,8 +323,28 @@ do { \
498 return; \ 323 return; \
499} while (0) 324} while (0)
500 325
326/**
327 * closure_return - finish execution of a closure
328 *
329 * This is used to indicate that @cl is finished: when all outstanding refs on
330 * @cl have been dropped @cl's ref on its parent closure (as passed to
331 * closure_init()) will be dropped, if one was specified - thus this can be
332 * thought of as returning to the parent closure.
333 */
501#define closure_return(_cl) continue_at((_cl), NULL, NULL) 334#define closure_return(_cl) continue_at((_cl), NULL, NULL)
502 335
336/**
337 * continue_at_nobarrier - jump to another function without barrier
338 *
339 * Causes @fn to be executed out of @cl, in @wq context (or called directly if
340 * @wq is NULL).
341 *
342 * NOTE: like continue_at(), this macro expands to a return in the caller!
343 *
344 * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
345 * thus it's not safe to touch anything protected by @cl after a
346 * continue_at_nobarrier().
347 */
503#define continue_at_nobarrier(_cl, _fn, _wq) \ 348#define continue_at_nobarrier(_cl, _fn, _wq) \
504do { \ 349do { \
505 set_closure_fn(_cl, _fn, _wq); \ 350 set_closure_fn(_cl, _fn, _wq); \
@@ -507,6 +352,15 @@ do { \
507 return; \ 352 return; \
508} while (0) 353} while (0)
509 354
355/**
356 * closure_return_with_destructor - finish execution of a closure, with destructor
357 *
358 * Works like closure_return(), except @destructor will be called when all
359 * outstanding refs on @cl have been dropped; @destructor may be used to safely
360 * free the memory occupied by @cl, and it is called with the ref on the parent
361 * closure still held - so @destructor could safely return an item to a
362 * freelist protected by @cl's parent.
363 */
510#define closure_return_with_destructor(_cl, _destructor) \ 364#define closure_return_with_destructor(_cl, _destructor) \
511do { \ 365do { \
512 set_closure_fn(_cl, _destructor, NULL); \ 366 set_closure_fn(_cl, _destructor, NULL); \
@@ -514,6 +368,13 @@ do { \
514 return; \ 368 return; \
515} while (0) 369} while (0)
516 370
371/**
372 * closure_call - execute @fn out of a new, uninitialized closure
373 *
374 * Typically used when running out of one closure, and we want to run @fn
375 * asynchronously out of a new closure - @parent will then wait for @cl to
376 * finish.
377 */
517static inline void closure_call(struct closure *cl, closure_fn fn, 378static inline void closure_call(struct closure *cl, closure_fn fn,
518 struct workqueue_struct *wq, 379 struct workqueue_struct *wq,
519 struct closure *parent) 380 struct closure *parent)
@@ -522,12 +383,4 @@ static inline void closure_call(struct closure *cl, closure_fn fn,
522 continue_at_nobarrier(cl, fn, wq); 383 continue_at_nobarrier(cl, fn, wq);
523} 384}
524 385
525static inline void closure_trylock_call(struct closure *cl, closure_fn fn,
526 struct workqueue_struct *wq,
527 struct closure *parent)
528{
529 if (closure_trylock(cl, parent))
530 continue_at_nobarrier(cl, fn, wq);
531}
532
533#endif /* _LINUX_CLOSURE_H */ 386#endif /* _LINUX_CLOSURE_H */
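
The continue_at()/closure_return() documentation added above describes the usual shape of an asynchronous bcache-style operation: initialize a closure, hand out refs to outstanding work, and let the last closure_put() resume the next stage. The sketch below shows that shape under stated assumptions: struct example_op, example_wq and the stage functions are invented for illustration, the op is assumed heap-allocated (set_closure_fn() rejects stack closures), and only the closure_* calls and macros come from this header.

static struct workqueue_struct *example_wq; /* assumed created elsewhere */

struct example_op {
	struct closure cl;
	/* ... per-operation state ... */
};

static void example_finish(struct closure *cl)
{
	/* drops our ref on the parent passed to closure_init(), if any */
	closure_return(cl);
}

static void example_stage2(struct closure *cl)
{
	struct example_op *op = container_of(cl, struct example_op, cl);

	/* ... issue more async work that holds refs on op->cl ... */

	continue_at(&op->cl, example_finish, example_wq);
}

static void example_start(struct example_op *op, struct closure *parent)
{
	closure_init(&op->cl, parent);

	/* ... kick off async work, each piece holding a ref on op->cl ... */

	/* once those refs drop, example_stage2 runs out of example_wq */
	continue_at(&op->cl, example_stage2, example_wq);
}
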
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 03cb4d114e16..8b1f1d5c1819 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -8,6 +8,7 @@
8#include "bcache.h" 8#include "bcache.h"
9#include "btree.h" 9#include "btree.h"
10#include "debug.h" 10#include "debug.h"
11#include "extents.h"
11 12
12#include <linux/console.h> 13#include <linux/console.h>
13#include <linux/debugfs.h> 14#include <linux/debugfs.h>
@@ -17,156 +18,88 @@
17 18
18static struct dentry *debug; 19static struct dentry *debug;
19 20
20const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
21{
22 unsigned i;
23
24 for (i = 0; i < KEY_PTRS(k); i++)
25 if (ptr_available(c, k, i)) {
26 struct cache *ca = PTR_CACHE(c, k, i);
27 size_t bucket = PTR_BUCKET_NR(c, k, i);
28 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
29
30 if (KEY_SIZE(k) + r > c->sb.bucket_size)
31 return "bad, length too big";
32 if (bucket < ca->sb.first_bucket)
33 return "bad, short offset";
34 if (bucket >= ca->sb.nbuckets)
35 return "bad, offset past end of device";
36 if (ptr_stale(c, k, i))
37 return "stale";
38 }
39
40 if (!bkey_cmp(k, &ZERO_KEY))
41 return "bad, null key";
42 if (!KEY_PTRS(k))
43 return "bad, no pointers";
44 if (!KEY_SIZE(k))
45 return "zeroed key";
46 return "";
47}
48
49int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
50{
51 unsigned i = 0;
52 char *out = buf, *end = buf + size;
53
54#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
55
56 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k));
57
58 if (KEY_PTRS(k))
59 while (1) {
60 p("%llu:%llu gen %llu",
61 PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i));
62
63 if (++i == KEY_PTRS(k))
64 break;
65
66 p(", ");
67 }
68
69 p("]");
70
71 if (KEY_DIRTY(k))
72 p(" dirty");
73 if (KEY_CSUM(k))
74 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
75#undef p
76 return out - buf;
77}
78
79#ifdef CONFIG_BCACHE_DEBUG 21#ifdef CONFIG_BCACHE_DEBUG
80 22
81static void dump_bset(struct btree *b, struct bset *i) 23#define for_each_written_bset(b, start, i) \
82{ 24 for (i = (start); \
83 struct bkey *k, *next; 25 (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
84 unsigned j; 26 i->seq == (start)->seq; \
85 char buf[80]; 27 i = (void *) i + set_blocks(i, block_bytes(b->c)) * \
86 28 block_bytes(b->c))
87 for (k = i->start; k < end(i); k = next) {
88 next = bkey_next(k);
89
90 bch_bkey_to_text(buf, sizeof(buf), k);
91 printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
92 (uint64_t *) k - i->d, i->keys, buf);
93
94 for (j = 0; j < KEY_PTRS(k); j++) {
95 size_t n = PTR_BUCKET_NR(b->c, k, j);
96 printk(" bucket %zu", n);
97
98 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
99 printk(" prio %i",
100 PTR_BUCKET(b->c, k, j)->prio);
101 }
102 29
103 printk(" %s\n", bch_ptr_status(b->c, k)); 30void bch_btree_verify(struct btree *b)
104
105 if (next < end(i) &&
106 bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0)
107 printk(KERN_ERR "Key skipped backwards\n");
108 }
109}
110
111static void bch_dump_bucket(struct btree *b)
112{
113 unsigned i;
114
115 console_lock();
116 for (i = 0; i <= b->nsets; i++)
117 dump_bset(b, b->sets[i].data);
118 console_unlock();
119}
120
121void bch_btree_verify(struct btree *b, struct bset *new)
122{ 31{
123 struct btree *v = b->c->verify_data; 32 struct btree *v = b->c->verify_data;
124 struct closure cl; 33 struct bset *ondisk, *sorted, *inmemory;
125 closure_init_stack(&cl); 34 struct bio *bio;
126 35
127 if (!b->c->verify) 36 if (!b->c->verify || !b->c->verify_ondisk)
128 return; 37 return;
129 38
130 closure_wait_event(&b->io.wait, &cl, 39 down(&b->io_mutex);
131 atomic_read(&b->io.cl.remaining) == -1);
132
133 mutex_lock(&b->c->verify_lock); 40 mutex_lock(&b->c->verify_lock);
134 41
42 ondisk = b->c->verify_ondisk;
43 sorted = b->c->verify_data->keys.set->data;
44 inmemory = b->keys.set->data;
45
135 bkey_copy(&v->key, &b->key); 46 bkey_copy(&v->key, &b->key);
136 v->written = 0; 47 v->written = 0;
137 v->level = b->level; 48 v->level = b->level;
49 v->keys.ops = b->keys.ops;
50
51 bio = bch_bbio_alloc(b->c);
52 bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev;
53 bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0);
54 bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9;
55 bch_bio_map(bio, sorted);
138 56
139 bch_btree_node_read(v); 57 submit_bio_wait(REQ_META|READ_SYNC, bio);
140 closure_wait_event(&v->io.wait, &cl, 58 bch_bbio_free(bio, b->c);
141 atomic_read(&b->io.cl.remaining) == -1);
142 59
143 if (new->keys != v->sets[0].data->keys || 60 memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9);
144 memcmp(new->start, 61
145 v->sets[0].data->start, 62 bch_btree_node_read_done(v);
146 (void *) end(new) - (void *) new->start)) { 63 sorted = v->keys.set->data;
147 unsigned i, j; 64
65 if (inmemory->keys != sorted->keys ||
66 memcmp(inmemory->start,
67 sorted->start,
68 (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) {
69 struct bset *i;
70 unsigned j;
148 71
149 console_lock(); 72 console_lock();
150 73
151 printk(KERN_ERR "*** original memory node:\n"); 74 printk(KERN_ERR "*** in memory:\n");
152 for (i = 0; i <= b->nsets; i++) 75 bch_dump_bset(&b->keys, inmemory, 0);
153 dump_bset(b, b->sets[i].data);
154 76
155 printk(KERN_ERR "*** sorted memory node:\n"); 77 printk(KERN_ERR "*** read back in:\n");
156 dump_bset(b, new); 78 bch_dump_bset(&v->keys, sorted, 0);
157 79
158 printk(KERN_ERR "*** on disk node:\n"); 80 for_each_written_bset(b, ondisk, i) {
159 dump_bset(v, v->sets[0].data); 81 unsigned block = ((void *) i - (void *) ondisk) /
82 block_bytes(b->c);
83
84 printk(KERN_ERR "*** on disk block %u:\n", block);
85 bch_dump_bset(&b->keys, i, block);
86 }
160 87
161 for (j = 0; j < new->keys; j++) 88 printk(KERN_ERR "*** block %zu not written\n",
162 if (new->d[j] != v->sets[0].data->d[j]) 89 ((void *) i - (void *) ondisk) / block_bytes(b->c));
90
91 for (j = 0; j < inmemory->keys; j++)
92 if (inmemory->d[j] != sorted->d[j])
163 break; 93 break;
164 94
95 printk(KERN_ERR "b->written %u\n", b->written);
96
165 console_unlock(); 97 console_unlock();
166 panic("verify failed at %u\n", j); 98 panic("verify failed at %u\n", j);
167 } 99 }
168 100
169 mutex_unlock(&b->c->verify_lock); 101 mutex_unlock(&b->c->verify_lock);
102 up(&b->io_mutex);
170} 103}
171 104
172void bch_data_verify(struct cached_dev *dc, struct bio *bio) 105void bch_data_verify(struct cached_dev *dc, struct bio *bio)
@@ -207,74 +140,6 @@ out_put:
207 bio_put(check); 140 bio_put(check);
208} 141}
209 142
210int __bch_count_data(struct btree *b)
211{
212 unsigned ret = 0;
213 struct btree_iter iter;
214 struct bkey *k;
215
216 if (!b->level)
217 for_each_key(b, k, &iter)
218 ret += KEY_SIZE(k);
219 return ret;
220}
221
222void __bch_check_keys(struct btree *b, const char *fmt, ...)
223{
224 va_list args;
225 struct bkey *k, *p = NULL;
226 struct btree_iter iter;
227 const char *err;
228
229 for_each_key(b, k, &iter) {
230 if (!b->level) {
231 err = "Keys out of order";
232 if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
233 goto bug;
234
235 if (bch_ptr_invalid(b, k))
236 continue;
237
238 err = "Overlapping keys";
239 if (p && bkey_cmp(p, &START_KEY(k)) > 0)
240 goto bug;
241 } else {
242 if (bch_ptr_bad(b, k))
243 continue;
244
245 err = "Duplicate keys";
246 if (p && !bkey_cmp(p, k))
247 goto bug;
248 }
249 p = k;
250 }
251
252 err = "Key larger than btree node key";
253 if (p && bkey_cmp(p, &b->key) > 0)
254 goto bug;
255
256 return;
257bug:
258 bch_dump_bucket(b);
259
260 va_start(args, fmt);
261 vprintk(fmt, args);
262 va_end(args);
263
264 panic("bcache error: %s:\n", err);
265}
266
267void bch_btree_iter_next_check(struct btree_iter *iter)
268{
269 struct bkey *k = iter->data->k, *next = bkey_next(k);
270
271 if (next < iter->data->end &&
272 bkey_cmp(k, iter->b->level ? next : &START_KEY(next)) > 0) {
273 bch_dump_bucket(iter->b);
274 panic("Key skipped backwards\n");
275 }
276}
277
278#endif 143#endif
279 144
280#ifdef CONFIG_DEBUG_FS 145#ifdef CONFIG_DEBUG_FS
@@ -321,7 +186,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
321 if (!w) 186 if (!w)
322 break; 187 break;
323 188
324 bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); 189 bch_extent_to_text(kbuf, sizeof(kbuf), &w->key);
325 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); 190 i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
326 bch_keybuf_del(&i->keys, w); 191 bch_keybuf_del(&i->keys, w);
327 } 192 }
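
bch_btree_verify() now reads the node back with a synchronous bio and walks the written bsets with for_each_written_bset(), which steps through back-to-back bsets that share the first bset's sequence number, each rounded up to whole blocks. Below is a self-contained toy illustration of that walk; the types, field names and rounding helper are stand-ins, not bcache's.

#include <stddef.h>
#include <stdint.h>

struct toy_bset {
	uint64_t seq;
	uint32_t keys; /* number of u64s of key data that follow */
	uint64_t d[];
};

/* bytes occupied on disk, rounded up to whole blocks like set_blocks() */
static size_t toy_bset_bytes(const struct toy_bset *i, size_t block_bytes)
{
	size_t bytes = sizeof(*i) + i->keys * sizeof(uint64_t);

	return (bytes + block_bytes - 1) / block_bytes * block_bytes;
}

static void toy_walk_written(const void *start, size_t node_bytes,
			     size_t block_bytes,
			     void (*fn)(const struct toy_bset *))
{
	const struct toy_bset *first = start;
	const struct toy_bset *i = start;

	/* stop at the end of the node or at the first unwritten bset */
	while ((const char *)i < (const char *)start + node_bytes &&
	       i->seq == first->seq) {
		fn(i);
		i = (const struct toy_bset *)
			((const char *)i + toy_bset_bytes(i, block_bytes));
	}
}
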
diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h
index 2ede60e31874..1f63c195d247 100644
--- a/drivers/md/bcache/debug.h
+++ b/drivers/md/bcache/debug.h
@@ -1,47 +1,30 @@
1#ifndef _BCACHE_DEBUG_H 1#ifndef _BCACHE_DEBUG_H
2#define _BCACHE_DEBUG_H 2#define _BCACHE_DEBUG_H
3 3
4/* Btree/bkey debug printing */ 4struct bio;
5 5struct cached_dev;
6int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); 6struct cache_set;
7 7
8#ifdef CONFIG_BCACHE_DEBUG 8#ifdef CONFIG_BCACHE_DEBUG
9 9
10void bch_btree_verify(struct btree *, struct bset *); 10void bch_btree_verify(struct btree *);
11void bch_data_verify(struct cached_dev *, struct bio *); 11void bch_data_verify(struct cached_dev *, struct bio *);
12int __bch_count_data(struct btree *);
13void __bch_check_keys(struct btree *, const char *, ...);
14void bch_btree_iter_next_check(struct btree_iter *);
15 12
16#define EBUG_ON(cond) BUG_ON(cond)
17#define expensive_debug_checks(c) ((c)->expensive_debug_checks) 13#define expensive_debug_checks(c) ((c)->expensive_debug_checks)
18#define key_merging_disabled(c) ((c)->key_merging_disabled) 14#define key_merging_disabled(c) ((c)->key_merging_disabled)
19#define bypass_torture_test(d) ((d)->bypass_torture_test) 15#define bypass_torture_test(d) ((d)->bypass_torture_test)
20 16
21#else /* DEBUG */ 17#else /* DEBUG */
22 18
23static inline void bch_btree_verify(struct btree *b, struct bset *i) {} 19static inline void bch_btree_verify(struct btree *b) {}
24static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} 20static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}
25static inline int __bch_count_data(struct btree *b) { return -1; }
26static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {}
27static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
28 21
29#define EBUG_ON(cond) do { if (cond); } while (0)
30#define expensive_debug_checks(c) 0 22#define expensive_debug_checks(c) 0
31#define key_merging_disabled(c) 0 23#define key_merging_disabled(c) 0
32#define bypass_torture_test(d) 0 24#define bypass_torture_test(d) 0
33 25
34#endif 26#endif
35 27
36#define bch_count_data(b) \
37 (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1)
38
39#define bch_check_keys(b, ...) \
40do { \
41 if (expensive_debug_checks((b)->c)) \
42 __bch_check_keys(b, __VA_ARGS__); \
43} while (0)
44
45#ifdef CONFIG_DEBUG_FS 28#ifdef CONFIG_DEBUG_FS
46void bch_debug_init_cache_set(struct cache_set *); 29void bch_debug_init_cache_set(struct cache_set *);
47#else 30#else
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
new file mode 100644
index 000000000000..c3ead586dc27
--- /dev/null
+++ b/drivers/md/bcache/extents.c
@@ -0,0 +1,616 @@
1/*
2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3 *
4 * Uses a block device as cache for other block devices; optimized for SSDs.
5 * All allocation is done in buckets, which should match the erase block size
6 * of the device.
7 *
8 * Buckets containing cached data are kept on a heap sorted by priority;
9 * bucket priority is increased on cache hit, and periodically all the buckets
10 * on the heap have their priority scaled down. This currently is just used as
11 * an LRU but in the future should allow for more intelligent heuristics.
12 *
13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14 * counter. Garbage collection is used to remove stale pointers.
15 *
16 * Indexing is done via a btree; nodes are not necessarily fully sorted; rather,
17 * as keys are inserted we only sort the pages that have not yet been written.
18 * When garbage collection is run, we re-sort the entire node.
19 *
20 * All configuration is done via sysfs; see Documentation/bcache.txt.
21 */
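
The 8-bit counter mentioned above is what makes "freeing by incrementing" cheap: each pointer records the bucket's generation at allocation time, and once the bucket's generation has moved past it the pointer is stale and garbage collection can drop it. A standalone sketch of that comparison under the same signed 8-bit wrap-around rule the driver uses (the function name here is invented, not the driver's API):

#include <stdint.h>
#include <stdio.h>

/* Illustrative model: a pointer is stale once the bucket's generation has
 * advanced past the generation stored in the pointer. */
static int ptr_is_stale(uint8_t bucket_gen, uint8_t ptr_gen)
{
    return (int8_t)(bucket_gen - ptr_gen) > 0;
}

int main(void)
{
    printf("%d\n", ptr_is_stale(5, 5));    /* 0: same gen, pointer valid   */
    printf("%d\n", ptr_is_stale(6, 5));    /* 1: bucket reused since alloc */
    printf("%d\n", ptr_is_stale(2, 250));  /* 1: still stale across wrap   */
    return 0;
}
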
22
23#include "bcache.h"
24#include "btree.h"
25#include "debug.h"
26#include "extents.h"
27#include "writeback.h"
28
29static void sort_key_next(struct btree_iter *iter,
30 struct btree_iter_set *i)
31{
32 i->k = bkey_next(i->k);
33
34 if (i->k == i->end)
35 *i = iter->data[--iter->used];
36}
37
38static bool bch_key_sort_cmp(struct btree_iter_set l,
39 struct btree_iter_set r)
40{
41 int64_t c = bkey_cmp(l.k, r.k);
42
43 return c ? c > 0 : l.k < r.k;
44}
45
46static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
47{
48 unsigned i;
49
50 for (i = 0; i < KEY_PTRS(k); i++)
51 if (ptr_available(c, k, i)) {
52 struct cache *ca = PTR_CACHE(c, k, i);
53 size_t bucket = PTR_BUCKET_NR(c, k, i);
54 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
55
56 if (KEY_SIZE(k) + r > c->sb.bucket_size ||
57 bucket < ca->sb.first_bucket ||
58 bucket >= ca->sb.nbuckets)
59 return true;
60 }
61
62 return false;
63}
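
__ptr_invalid() treats a pointer as bad when the extent would run past the end of its bucket or the bucket index falls outside the device. A userspace model of the same bounds check, with invented geometry (1024-sector buckets, buckets 1..999) and plain division/modulo standing in for the PTR_BUCKET_NR()/bucket_remainder() macros:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct geom { uint64_t bucket_size, first_bucket, nbuckets; };

static bool ptr_out_of_bounds(struct geom g, uint64_t ptr_offset,
                              uint64_t key_size)
{
    uint64_t bucket    = ptr_offset / g.bucket_size;
    uint64_t remainder = ptr_offset % g.bucket_size;

    return key_size + remainder > g.bucket_size ||  /* spills past bucket end */
           bucket < g.first_bucket ||
           bucket >= g.nbuckets;
}

int main(void)
{
    struct geom g = { .bucket_size = 1024, .first_bucket = 1, .nbuckets = 1000 };

    /* 128 sectors starting 64 sectors into bucket 10: fits. */
    printf("%d\n", ptr_out_of_bounds(g, 10 * 1024 + 64, 128));   /* 0 */
    /* 256 sectors starting 900 sectors into bucket 10: crosses a bucket. */
    printf("%d\n", ptr_out_of_bounds(g, 10 * 1024 + 900, 256));  /* 1 */
    return 0;
}
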
64
65/* Common among btree and extent ptrs */
66
67static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
68{
69 unsigned i;
70
71 for (i = 0; i < KEY_PTRS(k); i++)
72 if (ptr_available(c, k, i)) {
73 struct cache *ca = PTR_CACHE(c, k, i);
74 size_t bucket = PTR_BUCKET_NR(c, k, i);
75 size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
76
77 if (KEY_SIZE(k) + r > c->sb.bucket_size)
78 return "bad, length too big";
79 if (bucket < ca->sb.first_bucket)
80 return "bad, short offset";
81 if (bucket >= ca->sb.nbuckets)
82 return "bad, offset past end of device";
83 if (ptr_stale(c, k, i))
84 return "stale";
85 }
86
87 if (!bkey_cmp(k, &ZERO_KEY))
88 return "bad, null key";
89 if (!KEY_PTRS(k))
90 return "bad, no pointers";
91 if (!KEY_SIZE(k))
92 return "zeroed key";
93 return "";
94}
95
96void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
97{
98 unsigned i = 0;
99 char *out = buf, *end = buf + size;
100
101#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
102
103 p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));
104
105 for (i = 0; i < KEY_PTRS(k); i++) {
106 if (i)
107 p(", ");
108
109 if (PTR_DEV(k, i) == PTR_CHECK_DEV)
110 p("check dev");
111 else
112 p("%llu:%llu gen %llu", PTR_DEV(k, i),
113 PTR_OFFSET(k, i), PTR_GEN(k, i));
114 }
115
116 p("]");
117
118 if (KEY_DIRTY(k))
119 p(" dirty");
120 if (KEY_CSUM(k))
121 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
122#undef p
123}
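
bch_extent_to_text() builds lines of the form inode:end_offset len size -> [dev:offset gen g], which is the format that shows up in the debugfs dump and the cache_bug() messages below. A small userspace sketch of the same accumulate-into-a-buffer pattern, with invented key values and snprintf standing in for the kernel's scnprintf (the buffer is large enough that the difference doesn't matter here):

#include <stdio.h>

int main(void)
{
    char buf[80], *out = buf, *end = buf + sizeof(buf);

    out += snprintf(out, end - out, "%u:%u len %u -> [", 0u, 4096u, 128u);
    out += snprintf(out, end - out, "%u:%u gen %u", 1u, 8192u, 3u);
    out += snprintf(out, end - out, "]");
    out += snprintf(out, end - out, " dirty");

    puts(buf);    /* 0:4096 len 128 -> [1:8192 gen 3] dirty */
    return 0;
}
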
124
125static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
126{
127 struct btree *b = container_of(keys, struct btree, keys);
128 unsigned j;
129 char buf[80];
130
131 bch_extent_to_text(buf, sizeof(buf), k);
132 printk(" %s", buf);
133
134 for (j = 0; j < KEY_PTRS(k); j++) {
135 size_t n = PTR_BUCKET_NR(b->c, k, j);
136 printk(" bucket %zu", n);
137
138 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
139 printk(" prio %i",
140 PTR_BUCKET(b->c, k, j)->prio);
141 }
142
143 printk(" %s\n", bch_ptr_status(b->c, k));
144}
145
146/* Btree ptrs */
147
148bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
149{
150 char buf[80];
151
152 if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
153 goto bad;
154
155 if (__ptr_invalid(c, k))
156 goto bad;
157
158 return false;
159bad:
160 bch_extent_to_text(buf, sizeof(buf), k);
161 cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
162 return true;
163}
164
165static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
166{
167 struct btree *b = container_of(bk, struct btree, keys);
168 return __bch_btree_ptr_invalid(b->c, k);
169}
170
171static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
172{
173 unsigned i;
174 char buf[80];
175 struct bucket *g;
176
177 if (mutex_trylock(&b->c->bucket_lock)) {
178 for (i = 0; i < KEY_PTRS(k); i++)
179 if (ptr_available(b->c, k, i)) {
180 g = PTR_BUCKET(b->c, k, i);
181
182 if (KEY_DIRTY(k) ||
183 g->prio != BTREE_PRIO ||
184 (b->c->gc_mark_valid &&
185 GC_MARK(g) != GC_MARK_METADATA))
186 goto err;
187 }
188
189 mutex_unlock(&b->c->bucket_lock);
190 }
191
192 return false;
193err:
194 mutex_unlock(&b->c->bucket_lock);
195 bch_extent_to_text(buf, sizeof(buf), k);
196 btree_bug(b,
197"inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
198 buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
199 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
200 return true;
201}
202
203static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
204{
205 struct btree *b = container_of(bk, struct btree, keys);
206 unsigned i;
207
208 if (!bkey_cmp(k, &ZERO_KEY) ||
209 !KEY_PTRS(k) ||
210 bch_ptr_invalid(bk, k))
211 return true;
212
213 for (i = 0; i < KEY_PTRS(k); i++)
214 if (!ptr_available(b->c, k, i) ||
215 ptr_stale(b->c, k, i))
216 return true;
217
218 if (expensive_debug_checks(b->c) &&
219 btree_ptr_bad_expensive(b, k))
220 return true;
221
222 return false;
223}
224
225static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
226 struct bkey *insert,
227 struct btree_iter *iter,
228 struct bkey *replace_key)
229{
230 struct btree *b = container_of(bk, struct btree, keys);
231
232 if (!KEY_OFFSET(insert))
233 btree_current_write(b)->prio_blocked++;
234
235 return false;
236}
237
238const struct btree_keys_ops bch_btree_keys_ops = {
239 .sort_cmp = bch_key_sort_cmp,
240 .insert_fixup = bch_btree_ptr_insert_fixup,
241 .key_invalid = bch_btree_ptr_invalid,
242 .key_bad = bch_btree_ptr_bad,
243 .key_to_text = bch_extent_to_text,
244 .key_dump = bch_bkey_dump,
245};
246
247/* Extents */
248
249/*
250 * Returns true if l > r - unless l == r, in which case returns true if l is
251 * older than r.
252 *
253 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
254 * equal in different sets, we have to process them newest to oldest.
255 */
256static bool bch_extent_sort_cmp(struct btree_iter_set l,
257 struct btree_iter_set r)
258{
259 int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
260
261 return c ? c > 0 : l.k < r.k;
262}
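
A userspace model of that two-level comparison: the primary sort is by the key's position, and on a tie the bare pointer compare l.k < r.k above decides which copy wins. Below, 'start' collapses bkey_cmp(&START_KEY(l), &START_KEY(r)) to a single signed offset and 'set_idx' is an invented stand-in for the pointer compare, with lower meaning the older set, as the comment describes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct iter_set { int64_t start; unsigned set_idx; };

static bool extent_sort_cmp(struct iter_set l, struct iter_set r)
{
    int64_t c = l.start - r.start;

    return c ? c > 0 : l.set_idx < r.set_idx;
}

int main(void)
{
    struct iter_set a = { .start = 100, .set_idx = 0 };  /* older set */
    struct iter_set b = { .start = 100, .set_idx = 2 };  /* newer set */
    struct iter_set c = { .start = 200, .set_idx = 1 };

    printf("%d %d %d\n",
           extent_sort_cmp(c, a),   /* 1: larger start                  */
           extent_sort_cmp(a, c),   /* 0                                */
           extent_sort_cmp(a, b));  /* 1: equal start, older set "wins" */
    return 0;
}
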
263
264static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
265 struct bkey *tmp)
266{
267 while (iter->used > 1) {
268 struct btree_iter_set *top = iter->data, *i = top + 1;
269
270 if (iter->used > 2 &&
271 bch_extent_sort_cmp(i[0], i[1]))
272 i++;
273
274 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
275 break;
276
277 if (!KEY_SIZE(i->k)) {
278 sort_key_next(iter, i);
279 heap_sift(iter, i - top, bch_extent_sort_cmp);
280 continue;
281 }
282
283 if (top->k > i->k) {
284 if (bkey_cmp(top->k, i->k) >= 0)
285 sort_key_next(iter, i);
286 else
287 bch_cut_front(top->k, i->k);
288
289 heap_sift(iter, i - top, bch_extent_sort_cmp);
290 } else {
291 /* can't happen because of comparison func */
292 BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
293
294 if (bkey_cmp(i->k, top->k) < 0) {
295 bkey_copy(tmp, top->k);
296
297 bch_cut_back(&START_KEY(i->k), tmp);
298 bch_cut_front(i->k, top->k);
299 heap_sift(iter, 0, bch_extent_sort_cmp);
300
301 return tmp;
302 } else {
303 bch_cut_back(&START_KEY(i->k), top->k);
304 }
305 }
306 }
307
308 return NULL;
309}
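
The overlap resolution above, and the insert path further down, lean on two primitives: bch_cut_front(where, k) drops the part of k before where, and bch_cut_back(where, k) drops the part after it, remembering that a bkey's offset is the end of the extent (KEY_START = offset - size). A standalone sketch of that arithmetic on a plain (end, size) pair, trimming a 0..100 extent around an overlapping 40..60 one:

#include <stdint.h>
#include <stdio.h>

/* A key is modelled as (end, size), so start = end - size, matching the way
 * a bkey stores its offset; cut_front()/cut_back() mirror what
 * bch_cut_front()/bch_cut_back() do to the size and offset fields. */
struct ext { uint64_t end, size; };

static void cut_front(uint64_t where, struct ext *k)  /* drop [start, where) */
{
    k->size = k->end - where;
}

static void cut_back(uint64_t where, struct ext *k)   /* drop [where, end)   */
{
    k->size = where - (k->end - k->size);
    k->end  = where;
}

int main(void)
{
    /* An extent covering 0..100 overlapped in the middle by 40..60:
     * keep the left piece in one copy and the right piece in another. */
    struct ext left  = { .end = 100, .size = 100 };
    struct ext right = { .end = 100, .size = 100 };

    cut_back(40, &left);      /* left  becomes  0..40  */
    cut_front(60, &right);    /* right becomes 60..100 */

    printf("left %llu..%llu, right %llu..%llu\n",
           (unsigned long long)(left.end - left.size),
           (unsigned long long)left.end,
           (unsigned long long)(right.end - right.size),
           (unsigned long long)right.end);
    return 0;
}
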
310
311static bool bch_extent_insert_fixup(struct btree_keys *b,
312 struct bkey *insert,
313 struct btree_iter *iter,
314 struct bkey *replace_key)
315{
316 struct cache_set *c = container_of(b, struct btree, keys)->c;
317
318 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
319 {
320 if (KEY_DIRTY(k))
321 bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
322 offset, -sectors);
323 }
324
325 uint64_t old_offset;
326 unsigned old_size, sectors_found = 0;
327
328 BUG_ON(!KEY_OFFSET(insert));
329 BUG_ON(!KEY_SIZE(insert));
330
331 while (1) {
332 struct bkey *k = bch_btree_iter_next(iter);
333 if (!k)
334 break;
335
336 if (bkey_cmp(&START_KEY(k), insert) >= 0) {
337 if (KEY_SIZE(k))
338 break;
339 else
340 continue;
341 }
342
343 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
344 continue;
345
346 old_offset = KEY_START(k);
347 old_size = KEY_SIZE(k);
348
349 /*
350 * We might overlap with zero-size extents; we can't skip these,
351 * because if they're in the set we're inserting into, we have to
352 * adjust them so they don't overlap with the key we're
353 * inserting. But we don't want to check them for replace
354 * operations.
355 */
356
357 if (replace_key && KEY_SIZE(k)) {
358 /*
359 * k might have been split since we inserted/found the
360 * key we're replacing
361 */
362 unsigned i;
363 uint64_t offset = KEY_START(k) -
364 KEY_START(replace_key);
365
366 /* But it must be a subset of the replace key */
367 if (KEY_START(k) < KEY_START(replace_key) ||
368 KEY_OFFSET(k) > KEY_OFFSET(replace_key))
369 goto check_failed;
370
371 /* We didn't find a key that we were supposed to */
372 if (KEY_START(k) > KEY_START(insert) + sectors_found)
373 goto check_failed;
374
375 if (!bch_bkey_equal_header(k, replace_key))
376 goto check_failed;
377
378 /* skip past gen */
379 offset <<= 8;
380
381 BUG_ON(!KEY_PTRS(replace_key));
382
383 for (i = 0; i < KEY_PTRS(replace_key); i++)
384 if (k->ptr[i] != replace_key->ptr[i] + offset)
385 goto check_failed;
386
387 sectors_found = KEY_OFFSET(k) - KEY_START(insert);
388 }
389
390 if (bkey_cmp(insert, k) < 0 &&
391 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
392 /*
393 * We overlapped in the middle of an existing key: that
394 * means we have to split the old key. But we have to do
395 * slightly different things depending on whether the
396 * old key has been written out yet.
397 */
398
399 struct bkey *top;
400
401 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
402
403 if (bkey_written(b, k)) {
404 /*
405 * We insert a new key to cover the top of the
406 * old key, and the old key is modified in place
407 * to represent the bottom split.
408 *
409 * It's completely arbitrary whether the new key
410 * is the top or the bottom, but it has to match
411 * up with what btree_sort_fixup() does - it
412 * doesn't check for this kind of overlap; it
413 * depends on us inserting a new key for the top
414 * here.
415 */
416 top = bch_bset_search(b, bset_tree_last(b),
417 insert);
418 bch_bset_insert(b, top, k);
419 } else {
420 BKEY_PADDED(key) temp;
421 bkey_copy(&temp.key, k);
422 bch_bset_insert(b, k, &temp.key);
423 top = bkey_next(k);
424 }
425
426 bch_cut_front(insert, top);
427 bch_cut_back(&START_KEY(insert), k);
428 bch_bset_fix_invalidated_key(b, k);
429 goto out;
430 }
431
432 if (bkey_cmp(insert, k) < 0) {
433 bch_cut_front(insert, k);
434 } else {
435 if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
436 old_offset = KEY_START(insert);
437
438 if (bkey_written(b, k) &&
439 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
440 /*
441 * Completely overwrote, so we don't have to
442 * invalidate the binary search tree
443 */
444 bch_cut_front(k, k);
445 } else {
446 __bch_cut_back(&START_KEY(insert), k);
447 bch_bset_fix_invalidated_key(b, k);
448 }
449 }
450
451 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
452 }
453
454check_failed:
455 if (replace_key) {
456 if (!sectors_found) {
457 return true;
458 } else if (sectors_found < KEY_SIZE(insert)) {
459 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
460 (KEY_SIZE(insert) - sectors_found));
461 SET_KEY_SIZE(insert, sectors_found);
462 }
463 }
464out:
465 if (KEY_DIRTY(insert))
466 bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
467 KEY_START(insert),
468 KEY_SIZE(insert));
469
470 return false;
471}
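
When a replace-style insert only partially matches - the key it expected has since been split or partly overwritten - the check_failed path above shrinks the insert to just the sectors that did match. The same arithmetic worked through with invented numbers (a key's offset is its end):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Invented numbers: the insert spans sectors 0..128, but only the
     * first 64 sectors of replace_key were still in place. */
    uint64_t key_offset    = 128;   /* end of the insert        */
    uint64_t key_size      = 128;   /* so it starts at sector 0 */
    uint64_t sectors_found = 64;

    key_offset -= key_size - sectors_found;   /* 128 - (128 - 64) = 64 */
    key_size    = sectors_found;              /* 64                    */

    printf("trimmed insert: %llu..%llu (%llu sectors)\n",
           (unsigned long long)(key_offset - key_size),
           (unsigned long long)key_offset,
           (unsigned long long)key_size);     /* 0..64 (64 sectors) */
    return 0;
}
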
472
473static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
474{
475 struct btree *b = container_of(bk, struct btree, keys);
476 char buf[80];
477
478 if (!KEY_SIZE(k))
479 return true;
480
481 if (KEY_SIZE(k) > KEY_OFFSET(k))
482 goto bad;
483
484 if (__ptr_invalid(b->c, k))
485 goto bad;
486
487 return false;
488bad:
489 bch_extent_to_text(buf, sizeof(buf), k);
490 cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
491 return true;
492}
493
494static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
495 unsigned ptr)
496{
497 struct bucket *g = PTR_BUCKET(b->c, k, ptr);
498 char buf[80];
499
500 if (mutex_trylock(&b->c->bucket_lock)) {
501 if (b->c->gc_mark_valid &&
502 ((GC_MARK(g) != GC_MARK_DIRTY &&
503 KEY_DIRTY(k)) ||
504 GC_MARK(g) == GC_MARK_METADATA))
505 goto err;
506
507 if (g->prio == BTREE_PRIO)
508 goto err;
509
510 mutex_unlock(&b->c->bucket_lock);
511 }
512
513 return false;
514err:
515 mutex_unlock(&b->c->bucket_lock);
516 bch_extent_to_text(buf, sizeof(buf), k);
517 btree_bug(b,
518"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
519 buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
520 g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
521 return true;
522}
523
524static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
525{
526 struct btree *b = container_of(bk, struct btree, keys);
527 struct bucket *g;
528 unsigned i, stale;
529
530 if (!KEY_PTRS(k) ||
531 bch_extent_invalid(bk, k))
532 return true;
533
534 for (i = 0; i < KEY_PTRS(k); i++)
535 if (!ptr_available(b->c, k, i))
536 return true;
537
538 if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
539 return false;
540
541 for (i = 0; i < KEY_PTRS(k); i++) {
542 g = PTR_BUCKET(b->c, k, i);
543 stale = ptr_stale(b->c, k, i);
544
545 btree_bug_on(stale > 96, b,
546 "key too stale: %i, need_gc %u",
547 stale, b->c->need_gc);
548
549 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
550 b, "stale dirty pointer");
551
552 if (stale)
553 return true;
554
555 if (expensive_debug_checks(b->c) &&
556 bch_extent_bad_expensive(b, k, i))
557 return true;
558 }
559
560 return false;
561}
562
563static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
564{
565 return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
566 ~((uint64_t)1 << 63);
567}
568
569static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
570{
571 struct btree *b = container_of(bk, struct btree, keys);
572 unsigned i;
573
574 if (key_merging_disabled(b->c))
575 return false;
576
577 for (i = 0; i < KEY_PTRS(l); i++)
578 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
579 PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
580 return false;
581
582 /* Keys with no pointers aren't restricted to one bucket and could
583 * overflow KEY_SIZE
584 */
585 if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
586 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
587 SET_KEY_SIZE(l, USHRT_MAX);
588
589 bch_cut_front(l, r);
590 return false;
591 }
592
593 if (KEY_CSUM(l)) {
594 if (KEY_CSUM(r))
595 l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
596 else
597 SET_KEY_CSUM(l, 0);
598 }
599
600 SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
601 SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
602
603 return true;
604}
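
KEY_SIZE() is a 16-bit field, so bch_extent_merge() refuses to produce an oversized key: it grows the left key to exactly USHRT_MAX, leaves the remainder in the right key, and reports no merge. A standalone model of just that size handling (the pointer-adjacency and checksum parts are left out):

#include <stdint.h>
#include <stdio.h>

#define MAX_KEY_SIZE 65535u   /* USHRT_MAX: KEY_SIZE() is 16 bits wide */

/* Offsets are extent end offsets, as in bcache. */
struct ext { uint64_t end; uint32_t size; };

static int try_merge(struct ext *l, struct ext *r)
{
    if (l->size + r->size > MAX_KEY_SIZE) {
        uint32_t grow = MAX_KEY_SIZE - l->size;

        l->end  += grow;            /* l absorbs what it can           */
        l->size  = MAX_KEY_SIZE;
        r->size -= grow;            /* r keeps the rest, end unchanged */
        return 0;                   /* not merged                      */
    }

    l->end  += r->size;
    l->size += r->size;
    return 1;                       /* fully merged into l             */
}

int main(void)
{
    struct ext l = { .end = 60000, .size = 60000 };   /* 0..60000     */
    struct ext r = { .end = 70000, .size = 10000 };   /* 60000..70000 */
    int merged = try_merge(&l, &r);

    printf("merged=%d l=[%llu..%llu] r=[%llu..%llu]\n", merged,
           (unsigned long long)(l.end - l.size), (unsigned long long)l.end,
           (unsigned long long)(r.end - r.size), (unsigned long long)r.end);
    /* merged=0 l=[0..65535] r=[65535..70000] */
    return 0;
}
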
605
606const struct btree_keys_ops bch_extent_keys_ops = {
607 .sort_cmp = bch_extent_sort_cmp,
608 .sort_fixup = bch_extent_sort_fixup,
609 .insert_fixup = bch_extent_insert_fixup,
610 .key_invalid = bch_extent_invalid,
611 .key_bad = bch_extent_bad,
612 .key_merge = bch_extent_merge,
613 .key_to_text = bch_extent_to_text,
614 .key_dump = bch_bkey_dump,
615 .is_extents = true,
616};
diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h
new file mode 100644
index 000000000000..e4e23409782d
--- /dev/null
+++ b/drivers/md/bcache/extents.h
@@ -0,0 +1,13 @@
1#ifndef _BCACHE_EXTENTS_H
2#define _BCACHE_EXTENTS_H
3
4extern const struct btree_keys_ops bch_btree_keys_ops;
5extern const struct btree_keys_ops bch_extent_keys_ops;
6
7struct bkey;
8struct cache_set;
9
10void bch_extent_to_text(char *, size_t, const struct bkey *);
11bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
12
13#endif /* _BCACHE_EXTENTS_H */
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7eafdf09a0ae..18039affc306 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -44,11 +44,11 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list,
44 44
45 closure_init_stack(&cl); 45 closure_init_stack(&cl);
46 46
47 pr_debug("reading %llu", (uint64_t) bucket); 47 pr_debug("reading %u", bucket_index);
48 48
49 while (offset < ca->sb.bucket_size) { 49 while (offset < ca->sb.bucket_size) {
50reread: left = ca->sb.bucket_size - offset; 50reread: left = ca->sb.bucket_size - offset;
51 len = min_t(unsigned, left, PAGE_SECTORS * 8); 51 len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
52 52
53 bio_reset(bio); 53 bio_reset(bio);
54 bio->bi_iter.bi_sector = bucket + offset; 54 bio->bi_iter.bi_sector = bucket + offset;
@@ -74,19 +74,28 @@ reread: left = ca->sb.bucket_size - offset;
74 struct list_head *where; 74 struct list_head *where;
75 size_t blocks, bytes = set_bytes(j); 75 size_t blocks, bytes = set_bytes(j);
76 76
77 if (j->magic != jset_magic(&ca->sb)) 77 if (j->magic != jset_magic(&ca->sb)) {
78 pr_debug("%u: bad magic", bucket_index);
78 return ret; 79 return ret;
80 }
79 81
80 if (bytes > left << 9) 82 if (bytes > left << 9 ||
83 bytes > PAGE_SIZE << JSET_BITS) {
84 pr_info("%u: too big, %zu bytes, offset %u",
85 bucket_index, bytes, offset);
81 return ret; 86 return ret;
87 }
82 88
83 if (bytes > len << 9) 89 if (bytes > len << 9)
84 goto reread; 90 goto reread;
85 91
86 if (j->csum != csum_set(j)) 92 if (j->csum != csum_set(j)) {
93 pr_info("%u: bad csum, %zu bytes, offset %u",
94 bucket_index, bytes, offset);
87 return ret; 95 return ret;
96 }
88 97
89 blocks = set_blocks(j, ca->set); 98 blocks = set_blocks(j, block_bytes(ca->set));
90 99
91 while (!list_empty(list)) { 100 while (!list_empty(list)) {
92 i = list_first_entry(list, 101 i = list_first_entry(list,
@@ -275,7 +284,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
275 } 284 }
276 285
277 for (k = i->j.start; 286 for (k = i->j.start;
278 k < end(&i->j); 287 k < bset_bkey_last(&i->j);
279 k = bkey_next(k)) { 288 k = bkey_next(k)) {
280 unsigned j; 289 unsigned j;
281 290
@@ -313,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
313 n, i->j.seq - 1, start, end); 322 n, i->j.seq - 1, start, end);
314 323
315 for (k = i->j.start; 324 for (k = i->j.start;
316 k < end(&i->j); 325 k < bset_bkey_last(&i->j);
317 k = bkey_next(k)) { 326 k = bkey_next(k)) {
318 trace_bcache_journal_replay_key(k); 327 trace_bcache_journal_replay_key(k);
319 328
@@ -555,6 +564,14 @@ static void journal_write_done(struct closure *cl)
555 continue_at_nobarrier(cl, journal_write, system_wq); 564 continue_at_nobarrier(cl, journal_write, system_wq);
556} 565}
557 566
567static void journal_write_unlock(struct closure *cl)
568{
569 struct cache_set *c = container_of(cl, struct cache_set, journal.io);
570
571 c->journal.io_in_flight = 0;
572 spin_unlock(&c->journal.lock);
573}
574
558static void journal_write_unlocked(struct closure *cl) 575static void journal_write_unlocked(struct closure *cl)
559 __releases(c->journal.lock) 576 __releases(c->journal.lock)
560{ 577{
@@ -562,22 +579,15 @@ static void journal_write_unlocked(struct closure *cl)
562 struct cache *ca; 579 struct cache *ca;
563 struct journal_write *w = c->journal.cur; 580 struct journal_write *w = c->journal.cur;
564 struct bkey *k = &c->journal.key; 581 struct bkey *k = &c->journal.key;
565 unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; 582 unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
583 c->sb.block_size;
566 584
567 struct bio *bio; 585 struct bio *bio;
568 struct bio_list list; 586 struct bio_list list;
569 bio_list_init(&list); 587 bio_list_init(&list);
570 588
571 if (!w->need_write) { 589 if (!w->need_write) {
572 /* 590 closure_return_with_destructor(cl, journal_write_unlock);
573 * XXX: have to unlock closure before we unlock journal lock,
574 * else we race with bch_journal(). But this way we race
575 * against cache set unregister. Doh.
576 */
577 set_closure_fn(cl, NULL, NULL);
578 closure_sub(cl, CLOSURE_RUNNING + 1);
579 spin_unlock(&c->journal.lock);
580 return;
581 } else if (journal_full(&c->journal)) { 591 } else if (journal_full(&c->journal)) {
582 journal_reclaim(c); 592 journal_reclaim(c);
583 spin_unlock(&c->journal.lock); 593 spin_unlock(&c->journal.lock);
@@ -586,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
586 continue_at(cl, journal_write, system_wq); 596 continue_at(cl, journal_write, system_wq);
587 } 597 }
588 598
589 c->journal.blocks_free -= set_blocks(w->data, c); 599 c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
590 600
591 w->data->btree_level = c->root->level; 601 w->data->btree_level = c->root->level;
592 602
@@ -653,10 +663,12 @@ static void journal_try_write(struct cache_set *c)
653 663
654 w->need_write = true; 664 w->need_write = true;
655 665
656 if (closure_trylock(cl, &c->cl)) 666 if (!c->journal.io_in_flight) {
657 journal_write_unlocked(cl); 667 c->journal.io_in_flight = 1;
658 else 668 closure_call(cl, journal_write_unlocked, NULL, &c->cl);
669 } else {
659 spin_unlock(&c->journal.lock); 670 spin_unlock(&c->journal.lock);
671 }
660} 672}
661 673
662static struct journal_write *journal_wait_for_write(struct cache_set *c, 674static struct journal_write *journal_wait_for_write(struct cache_set *c,
@@ -664,6 +676,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
664{ 676{
665 size_t sectors; 677 size_t sectors;
666 struct closure cl; 678 struct closure cl;
679 bool wait = false;
667 680
668 closure_init_stack(&cl); 681 closure_init_stack(&cl);
669 682
@@ -673,16 +686,19 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
673 struct journal_write *w = c->journal.cur; 686 struct journal_write *w = c->journal.cur;
674 687
675 sectors = __set_blocks(w->data, w->data->keys + nkeys, 688 sectors = __set_blocks(w->data, w->data->keys + nkeys,
676 c) * c->sb.block_size; 689 block_bytes(c)) * c->sb.block_size;
677 690
678 if (sectors <= min_t(size_t, 691 if (sectors <= min_t(size_t,
679 c->journal.blocks_free * c->sb.block_size, 692 c->journal.blocks_free * c->sb.block_size,
680 PAGE_SECTORS << JSET_BITS)) 693 PAGE_SECTORS << JSET_BITS))
681 return w; 694 return w;
682 695
683 /* XXX: tracepoint */ 696 if (wait)
697 closure_wait(&c->journal.wait, &cl);
698
684 if (!journal_full(&c->journal)) { 699 if (!journal_full(&c->journal)) {
685 trace_bcache_journal_entry_full(c); 700 if (wait)
701 trace_bcache_journal_entry_full(c);
686 702
687 /* 703 /*
688 * XXX: If we were inserting so many keys that they 704 * XXX: If we were inserting so many keys that they
@@ -692,12 +708,11 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
692 */ 708 */
693 BUG_ON(!w->data->keys); 709 BUG_ON(!w->data->keys);
694 710
695 closure_wait(&w->wait, &cl);
696 journal_try_write(c); /* unlocks */ 711 journal_try_write(c); /* unlocks */
697 } else { 712 } else {
698 trace_bcache_journal_full(c); 713 if (wait)
714 trace_bcache_journal_full(c);
699 715
700 closure_wait(&c->journal.wait, &cl);
701 journal_reclaim(c); 716 journal_reclaim(c);
702 spin_unlock(&c->journal.lock); 717 spin_unlock(&c->journal.lock);
703 718
@@ -706,6 +721,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
706 721
707 closure_sync(&cl); 722 closure_sync(&cl);
708 spin_lock(&c->journal.lock); 723 spin_lock(&c->journal.lock);
724 wait = true;
709 } 725 }
710} 726}
711 727
@@ -736,7 +752,7 @@ atomic_t *bch_journal(struct cache_set *c,
736 752
737 w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); 753 w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
738 754
739 memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); 755 memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
740 w->data->keys += bch_keylist_nkeys(keys); 756 w->data->keys += bch_keylist_nkeys(keys);
741 757
742 ret = &fifo_back(&c->journal.pin); 758 ret = &fifo_back(&c->journal.pin);
@@ -780,7 +796,6 @@ int bch_journal_alloc(struct cache_set *c)
780{ 796{
781 struct journal *j = &c->journal; 797 struct journal *j = &c->journal;
782 798
783 closure_init_unlocked(&j->io);
784 spin_lock_init(&j->lock); 799 spin_lock_init(&j->lock);
785 INIT_DELAYED_WORK(&j->work, journal_write_work); 800 INIT_DELAYED_WORK(&j->work, journal_write_work);
786 801
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index a6472fda94b2..9180c4465075 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -104,6 +104,7 @@ struct journal {
104 /* used when waiting because the journal was full */ 104 /* used when waiting because the journal was full */
105 struct closure_waitlist wait; 105 struct closure_waitlist wait;
106 struct closure io; 106 struct closure io;
107 int io_in_flight;
107 struct delayed_work work; 108 struct delayed_work work;
108 109
109 /* Number of blocks free in the bucket(s) we're currently writing to */ 110 /* Number of blocks free in the bucket(s) we're currently writing to */
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 052bd24d24b4..9eb60d102de8 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -211,7 +211,7 @@ void bch_moving_gc(struct cache_set *c)
211 for_each_cache(ca, c, i) { 211 for_each_cache(ca, c, i) {
212 unsigned sectors_to_move = 0; 212 unsigned sectors_to_move = 0;
213 unsigned reserve_sectors = ca->sb.bucket_size * 213 unsigned reserve_sectors = ca->sb.bucket_size *
214 min(fifo_used(&ca->free), ca->free.size / 2); 214 fifo_used(&ca->free[RESERVE_MOVINGGC]);
215 215
216 ca->heap.used = 0; 216 ca->heap.used = 0;
217 217
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index c906571997d7..72cd213f213f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -254,6 +254,24 @@ static void bch_data_insert_keys(struct closure *cl)
254 closure_return(cl); 254 closure_return(cl);
255} 255}
256 256
257static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
258 struct cache_set *c)
259{
260 size_t oldsize = bch_keylist_nkeys(l);
261 size_t newsize = oldsize + u64s;
262
263 /*
264 * The journalling code doesn't handle the case where the keys to insert
265 * are bigger than an empty write: if we just return -ENOMEM here,
266 * bio_insert() and bio_invalidate() will insert the keys created so far
267 * and finish the rest when the keylist is empty.
268 */
269 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
270 return -ENOMEM;
271
272 return __bch_keylist_realloc(l, u64s);
273}
274
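
The bound above says the whole keylist must fit in a single journal write: one cache block minus the jset header. Worked through with invented sizes (the real values come from block_bytes(c) and sizeof(struct jset)):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    size_t block_bytes = 4096;   /* assumed cache block size            */
    size_t jset_header = 96;     /* placeholder for sizeof(struct jset) */
    size_t budget_u64s = (block_bytes - jset_header) / sizeof(uint64_t);

    /* bch_keylist_realloc() returns -ENOMEM once the keylist would grow
     * past this budget, forcing the keys gathered so far to be inserted. */
    printf("one journal write holds at most %zu u64s of keys\n",
           budget_u64s);         /* 500 with these made-up numbers */
    return 0;
}
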
257static void bch_data_invalidate(struct closure *cl) 275static void bch_data_invalidate(struct closure *cl)
258{ 276{
259 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); 277 struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
@@ -266,7 +284,7 @@ static void bch_data_invalidate(struct closure *cl)
266 unsigned sectors = min(bio_sectors(bio), 284 unsigned sectors = min(bio_sectors(bio),
267 1U << (KEY_SIZE_BITS - 1)); 285 1U << (KEY_SIZE_BITS - 1));
268 286
269 if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) 287 if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
270 goto out; 288 goto out;
271 289
272 bio->bi_iter.bi_sector += sectors; 290 bio->bi_iter.bi_sector += sectors;
@@ -356,7 +374,7 @@ static void bch_data_insert_start(struct closure *cl)
356 374
357 /* 1 for the device pointer and 1 for the chksum */ 375 /* 1 for the device pointer and 1 for the chksum */
358 if (bch_keylist_realloc(&op->insert_keys, 376 if (bch_keylist_realloc(&op->insert_keys,
359 1 + (op->csum ? 1 : 0), 377 3 + (op->csum ? 1 : 0),
360 op->c)) 378 op->c))
361 continue_at(cl, bch_data_insert_keys, bcache_wq); 379 continue_at(cl, bch_data_insert_keys, bcache_wq);
362 380
@@ -596,14 +614,12 @@ struct search {
596 /* Stack frame for bio_complete */ 614 /* Stack frame for bio_complete */
597 struct closure cl; 615 struct closure cl;
598 616
599 struct bcache_device *d;
600
601 struct bbio bio; 617 struct bbio bio;
602 struct bio *orig_bio; 618 struct bio *orig_bio;
603 struct bio *cache_miss; 619 struct bio *cache_miss;
620 struct bcache_device *d;
604 621
605 unsigned insert_bio_sectors; 622 unsigned insert_bio_sectors;
606
607 unsigned recoverable:1; 623 unsigned recoverable:1;
608 unsigned write:1; 624 unsigned write:1;
609 unsigned read_dirty_data:1; 625 unsigned read_dirty_data:1;
@@ -629,7 +645,8 @@ static void bch_cache_read_endio(struct bio *bio, int error)
629 645
630 if (error) 646 if (error)
631 s->iop.error = error; 647 s->iop.error = error;
632 else if (ptr_stale(s->iop.c, &b->key, 0)) { 648 else if (!KEY_DIRTY(&b->key) &&
649 ptr_stale(s->iop.c, &b->key, 0)) {
633 atomic_long_inc(&s->iop.c->cache_read_races); 650 atomic_long_inc(&s->iop.c->cache_read_races);
634 s->iop.error = -EINTR; 651 s->iop.error = -EINTR;
635 } 652 }
@@ -710,10 +727,13 @@ static void cache_lookup(struct closure *cl)
710{ 727{
711 struct search *s = container_of(cl, struct search, iop.cl); 728 struct search *s = container_of(cl, struct search, iop.cl);
712 struct bio *bio = &s->bio.bio; 729 struct bio *bio = &s->bio.bio;
730 int ret;
731
732 bch_btree_op_init(&s->op, -1);
713 733
714 int ret = bch_btree_map_keys(&s->op, s->iop.c, 734 ret = bch_btree_map_keys(&s->op, s->iop.c,
715 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), 735 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
716 cache_lookup_fn, MAP_END_KEY); 736 cache_lookup_fn, MAP_END_KEY);
717 if (ret == -EAGAIN) 737 if (ret == -EAGAIN)
718 continue_at(cl, cache_lookup, bcache_wq); 738 continue_at(cl, cache_lookup, bcache_wq);
719 739
@@ -754,12 +774,12 @@ static void bio_complete(struct search *s)
754 } 774 }
755} 775}
756 776
757static void do_bio_hook(struct search *s) 777static void do_bio_hook(struct search *s, struct bio *orig_bio)
758{ 778{
759 struct bio *bio = &s->bio.bio; 779 struct bio *bio = &s->bio.bio;
760 780
761 bio_init(bio); 781 bio_init(bio);
762 __bio_clone_fast(bio, s->orig_bio); 782 __bio_clone_fast(bio, orig_bio);
763 bio->bi_end_io = request_endio; 783 bio->bi_end_io = request_endio;
764 bio->bi_private = &s->cl; 784 bio->bi_private = &s->cl;
765 785
@@ -778,26 +798,32 @@ static void search_free(struct closure *cl)
778 mempool_free(s, s->d->c->search); 798 mempool_free(s, s->d->c->search);
779} 799}
780 800
781static struct search *search_alloc(struct bio *bio, struct bcache_device *d) 801static inline struct search *search_alloc(struct bio *bio,
802 struct bcache_device *d)
782{ 803{
783 struct search *s; 804 struct search *s;
784 805
785 s = mempool_alloc(d->c->search, GFP_NOIO); 806 s = mempool_alloc(d->c->search, GFP_NOIO);
786 memset(s, 0, offsetof(struct search, iop.insert_keys));
787 807
788 __closure_init(&s->cl, NULL); 808 closure_init(&s->cl, NULL);
809 do_bio_hook(s, bio);
789 810
790 s->iop.inode = d->id;
791 s->iop.c = d->c;
792 s->d = d;
793 s->op.lock = -1;
794 s->iop.write_point = hash_long((unsigned long) current, 16);
795 s->orig_bio = bio; 811 s->orig_bio = bio;
796 s->write = (bio->bi_rw & REQ_WRITE) != 0; 812 s->cache_miss = NULL;
797 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; 813 s->d = d;
798 s->recoverable = 1; 814 s->recoverable = 1;
815 s->write = (bio->bi_rw & REQ_WRITE) != 0;
816 s->read_dirty_data = 0;
799 s->start_time = jiffies; 817 s->start_time = jiffies;
800 do_bio_hook(s); 818
819 s->iop.c = d->c;
820 s->iop.bio = NULL;
821 s->iop.inode = d->id;
822 s->iop.write_point = hash_long((unsigned long) current, 16);
823 s->iop.write_prio = 0;
824 s->iop.error = 0;
825 s->iop.flags = 0;
826 s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
801 827
802 return s; 828 return s;
803} 829}
@@ -843,7 +869,7 @@ static void cached_dev_read_error(struct closure *cl)
843 trace_bcache_read_retry(s->orig_bio); 869 trace_bcache_read_retry(s->orig_bio);
844 870
845 s->iop.error = 0; 871 s->iop.error = 0;
846 do_bio_hook(s); 872 do_bio_hook(s, s->orig_bio);
847 873
848 /* XXX: invalidate cache */ 874 /* XXX: invalidate cache */
849 875
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 2cd65bf073c2..39f21dbedc38 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -13,17 +13,22 @@ struct data_insert_op {
13 uint16_t write_prio; 13 uint16_t write_prio;
14 short error; 14 short error;
15 15
16 unsigned bypass:1; 16 union {
17 unsigned writeback:1; 17 uint16_t flags;
18 unsigned flush_journal:1;
19 unsigned csum:1;
20 18
21 unsigned replace:1; 19 struct {
22 unsigned replace_collision:1; 20 unsigned bypass:1;
21 unsigned writeback:1;
22 unsigned flush_journal:1;
23 unsigned csum:1;
23 24
24 unsigned insert_data_done:1; 25 unsigned replace:1;
26 unsigned replace_collision:1;
27
28 unsigned insert_data_done:1;
29 };
30 };
25 31
26 /* Anything past this point won't get zeroed in search_alloc() */
27 struct keylist insert_keys; 32 struct keylist insert_keys;
28 BKEY_PADDED(replace_key); 33 BKEY_PADDED(replace_key);
29}; 34};
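
Folding the flag bitfields into a union with a 16-bit flags word is what lets search_alloc() in the request.c hunk above reset every flag with a single s->iop.flags = 0 store, replacing the old memset-up-to-an-offset trick. A minimal standalone mirror of that layout (field names kept, container type invented; anonymous struct/union members need C11 or GNU C, and, like the driver, this relies on the bits landing in the low 16 bits):

#include <stdint.h>
#include <stdio.h>

struct op_flags {
    union {
        uint16_t flags;
        struct {
            unsigned bypass:1;
            unsigned writeback:1;
            unsigned flush_journal:1;
            unsigned csum:1;
            unsigned replace:1;
            unsigned replace_collision:1;
            unsigned insert_data_done:1;
        };
    };
};

int main(void)
{
    struct op_flags o = { 0 };

    o.writeback = 1;
    o.replace   = 1;
    o.flags     = 0;    /* one store clears every flag at once */

    printf("%d %d\n", o.writeback, o.replace);   /* 0 0 */
    return 0;
}
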
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 93d593f957f6..24a3a1546caa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -9,6 +9,7 @@
9#include "bcache.h" 9#include "bcache.h"
10#include "btree.h" 10#include "btree.h"
11#include "debug.h" 11#include "debug.h"
12#include "extents.h"
12#include "request.h" 13#include "request.h"
13#include "writeback.h" 14#include "writeback.h"
14 15
@@ -225,7 +226,7 @@ static void write_bdev_super_endio(struct bio *bio, int error)
225 struct cached_dev *dc = bio->bi_private; 226 struct cached_dev *dc = bio->bi_private;
226 /* XXX: error checking */ 227 /* XXX: error checking */
227 228
228 closure_put(&dc->sb_write.cl); 229 closure_put(&dc->sb_write);
229} 230}
230 231
231static void __write_super(struct cache_sb *sb, struct bio *bio) 232static void __write_super(struct cache_sb *sb, struct bio *bio)
@@ -263,12 +264,20 @@ static void __write_super(struct cache_sb *sb, struct bio *bio)
263 submit_bio(REQ_WRITE, bio); 264 submit_bio(REQ_WRITE, bio);
264} 265}
265 266
267static void bch_write_bdev_super_unlock(struct closure *cl)
268{
269 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
270
271 up(&dc->sb_write_mutex);
272}
273
266void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) 274void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
267{ 275{
268 struct closure *cl = &dc->sb_write.cl; 276 struct closure *cl = &dc->sb_write;
269 struct bio *bio = &dc->sb_bio; 277 struct bio *bio = &dc->sb_bio;
270 278
271 closure_lock(&dc->sb_write, parent); 279 down(&dc->sb_write_mutex);
280 closure_init(cl, parent);
272 281
273 bio_reset(bio); 282 bio_reset(bio);
274 bio->bi_bdev = dc->bdev; 283 bio->bi_bdev = dc->bdev;
@@ -278,7 +287,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
278 closure_get(cl); 287 closure_get(cl);
279 __write_super(&dc->sb, bio); 288 __write_super(&dc->sb, bio);
280 289
281 closure_return(cl); 290 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
282} 291}
283 292
284static void write_super_endio(struct bio *bio, int error) 293static void write_super_endio(struct bio *bio, int error)
@@ -286,16 +295,24 @@ static void write_super_endio(struct bio *bio, int error)
286 struct cache *ca = bio->bi_private; 295 struct cache *ca = bio->bi_private;
287 296
288 bch_count_io_errors(ca, error, "writing superblock"); 297 bch_count_io_errors(ca, error, "writing superblock");
289 closure_put(&ca->set->sb_write.cl); 298 closure_put(&ca->set->sb_write);
299}
300
301static void bcache_write_super_unlock(struct closure *cl)
302{
303 struct cache_set *c = container_of(cl, struct cache_set, sb_write);
304
305 up(&c->sb_write_mutex);
290} 306}
291 307
292void bcache_write_super(struct cache_set *c) 308void bcache_write_super(struct cache_set *c)
293{ 309{
294 struct closure *cl = &c->sb_write.cl; 310 struct closure *cl = &c->sb_write;
295 struct cache *ca; 311 struct cache *ca;
296 unsigned i; 312 unsigned i;
297 313
298 closure_lock(&c->sb_write, &c->cl); 314 down(&c->sb_write_mutex);
315 closure_init(cl, &c->cl);
299 316
300 c->sb.seq++; 317 c->sb.seq++;
301 318
@@ -317,7 +334,7 @@ void bcache_write_super(struct cache_set *c)
317 __write_super(&ca->sb, bio); 334 __write_super(&ca->sb, bio);
318 } 335 }
319 336
320 closure_return(cl); 337 closure_return_with_destructor(cl, bcache_write_super_unlock);
321} 338}
322 339
323/* UUID io */ 340/* UUID io */
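
The sb_write changes above replace closure_lock() with an explicit semaphore taken before closure_init() and released by a destructor that only runs when the closure's last reference is put, i.e. after every superblock bio has completed. A much-simplified userspace model of that shape (plain refcount, no threads, no real locking or bios):

#include <stdio.h>

struct closure {
    int remaining;
    void (*destructor)(struct closure *);
};

static int sb_write_lock_held;                 /* stand-in for the semaphore */

static void closure_put(struct closure *cl)
{
    if (--cl->remaining == 0 && cl->destructor)
        cl->destructor(cl);
}

static void sb_write_unlock(struct closure *cl)
{
    (void)cl;
    sb_write_lock_held = 0;                    /* up(&sb_write_mutex) */
    printf("last reference dropped, lock released\n");
}

int main(void)
{
    struct closure cl = { .remaining = 1, .destructor = sb_write_unlock };

    sb_write_lock_held = 1;                    /* down(&sb_write_mutex) */

    cl.remaining++;                            /* closure_get(): one per bio */
    /* ...submit the superblock write; its endio handler would then do: */
    closure_put(&cl);                          /* bio completion */

    closure_put(&cl);                          /* closure_return_with_destructor() */
    return 0;
}
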
@@ -325,23 +342,31 @@ void bcache_write_super(struct cache_set *c)
325static void uuid_endio(struct bio *bio, int error) 342static void uuid_endio(struct bio *bio, int error)
326{ 343{
327 struct closure *cl = bio->bi_private; 344 struct closure *cl = bio->bi_private;
328 struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); 345 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
329 346
330 cache_set_err_on(error, c, "accessing uuids"); 347 cache_set_err_on(error, c, "accessing uuids");
331 bch_bbio_free(bio, c); 348 bch_bbio_free(bio, c);
332 closure_put(cl); 349 closure_put(cl);
333} 350}
334 351
352static void uuid_io_unlock(struct closure *cl)
353{
354 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
355
356 up(&c->uuid_write_mutex);
357}
358
335static void uuid_io(struct cache_set *c, unsigned long rw, 359static void uuid_io(struct cache_set *c, unsigned long rw,
336 struct bkey *k, struct closure *parent) 360 struct bkey *k, struct closure *parent)
337{ 361{
338 struct closure *cl = &c->uuid_write.cl; 362 struct closure *cl = &c->uuid_write;
339 struct uuid_entry *u; 363 struct uuid_entry *u;
340 unsigned i; 364 unsigned i;
341 char buf[80]; 365 char buf[80];
342 366
343 BUG_ON(!parent); 367 BUG_ON(!parent);
344 closure_lock(&c->uuid_write, parent); 368 down(&c->uuid_write_mutex);
369 closure_init(cl, parent);
345 370
346 for (i = 0; i < KEY_PTRS(k); i++) { 371 for (i = 0; i < KEY_PTRS(k); i++) {
347 struct bio *bio = bch_bbio_alloc(c); 372 struct bio *bio = bch_bbio_alloc(c);
@@ -359,7 +384,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
359 break; 384 break;
360 } 385 }
361 386
362 bch_bkey_to_text(buf, sizeof(buf), k); 387 bch_extent_to_text(buf, sizeof(buf), k);
363 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); 388 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
364 389
365 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) 390 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
@@ -368,14 +393,14 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
368 u - c->uuids, u->uuid, u->label, 393 u - c->uuids, u->uuid, u->label,
369 u->first_reg, u->last_reg, u->invalidated); 394 u->first_reg, u->last_reg, u->invalidated);
370 395
371 closure_return(cl); 396 closure_return_with_destructor(cl, uuid_io_unlock);
372} 397}
373 398
374static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) 399static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
375{ 400{
376 struct bkey *k = &j->uuid_bucket; 401 struct bkey *k = &j->uuid_bucket;
377 402
378 if (bch_btree_ptr_invalid(c, k)) 403 if (__bch_btree_ptr_invalid(c, k))
379 return "bad uuid pointer"; 404 return "bad uuid pointer";
380 405
381 bkey_copy(&c->uuid_bucket, k); 406 bkey_copy(&c->uuid_bucket, k);
@@ -420,7 +445,7 @@ static int __uuid_write(struct cache_set *c)
420 445
421 lockdep_assert_held(&bch_register_lock); 446 lockdep_assert_held(&bch_register_lock);
422 447
423 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) 448 if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
424 return 1; 449 return 1;
425 450
426 SET_KEY_SIZE(&k.key, c->sb.bucket_size); 451 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
@@ -538,8 +563,8 @@ void bch_prio_write(struct cache *ca)
538 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), 563 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
539 &ca->meta_sectors_written); 564 &ca->meta_sectors_written);
540 565
541 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), 566 //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
542 fifo_used(&ca->free_inc), fifo_used(&ca->unused)); 567 // fifo_used(&ca->free_inc), fifo_used(&ca->unused));
543 568
544 for (i = prio_buckets(ca) - 1; i >= 0; --i) { 569 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
545 long bucket; 570 long bucket;
@@ -558,7 +583,7 @@ void bch_prio_write(struct cache *ca)
558 p->magic = pset_magic(&ca->sb); 583 p->magic = pset_magic(&ca->sb);
559 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); 584 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
560 585
561 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); 586 bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
562 BUG_ON(bucket == -1); 587 BUG_ON(bucket == -1);
563 588
564 mutex_unlock(&ca->set->bucket_lock); 589 mutex_unlock(&ca->set->bucket_lock);
@@ -1098,7 +1123,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1098 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); 1123 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1099 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); 1124 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1100 INIT_WORK(&dc->detach, cached_dev_detach_finish); 1125 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1101 closure_init_unlocked(&dc->sb_write); 1126 sema_init(&dc->sb_write_mutex, 1);
1102 INIT_LIST_HEAD(&dc->io_lru); 1127 INIT_LIST_HEAD(&dc->io_lru);
1103 spin_lock_init(&dc->io_lock); 1128 spin_lock_init(&dc->io_lock);
1104 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); 1129 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
@@ -1110,6 +1135,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1110 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); 1135 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1111 } 1136 }
1112 1137
1138 dc->disk.stripe_size = q->limits.io_opt >> 9;
1139
1140 if (dc->disk.stripe_size)
1141 dc->partial_stripes_expensive =
1142 q->limits.raid_partial_stripes_expensive;
1143
1113 ret = bcache_device_init(&dc->disk, block_size, 1144 ret = bcache_device_init(&dc->disk, block_size,
1114 dc->bdev->bd_part->nr_sects - dc->sb.data_offset); 1145 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1115 if (ret) 1146 if (ret)
@@ -1321,8 +1352,8 @@ static void cache_set_free(struct closure *cl)
1321 if (ca) 1352 if (ca)
1322 kobject_put(&ca->kobj); 1353 kobject_put(&ca->kobj);
1323 1354
1355 bch_bset_sort_state_free(&c->sort);
1324 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); 1356 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1325 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
1326 1357
1327 if (c->bio_split) 1358 if (c->bio_split)
1328 bioset_free(c->bio_split); 1359 bioset_free(c->bio_split);
@@ -1447,21 +1478,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1447 c->block_bits = ilog2(sb->block_size); 1478 c->block_bits = ilog2(sb->block_size);
1448 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); 1479 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1449 1480
1450 c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; 1481 c->btree_pages = bucket_pages(c);
1451 if (c->btree_pages > BTREE_MAX_PAGES) 1482 if (c->btree_pages > BTREE_MAX_PAGES)
1452 c->btree_pages = max_t(int, c->btree_pages / 4, 1483 c->btree_pages = max_t(int, c->btree_pages / 4,
1453 BTREE_MAX_PAGES); 1484 BTREE_MAX_PAGES);
1454 1485
1455 c->sort_crit_factor = int_sqrt(c->btree_pages); 1486 sema_init(&c->sb_write_mutex, 1);
1456
1457 closure_init_unlocked(&c->sb_write);
1458 mutex_init(&c->bucket_lock); 1487 mutex_init(&c->bucket_lock);
1459 init_waitqueue_head(&c->try_wait); 1488 init_waitqueue_head(&c->try_wait);
1460 init_waitqueue_head(&c->bucket_wait); 1489 init_waitqueue_head(&c->bucket_wait);
1461 closure_init_unlocked(&c->uuid_write); 1490 sema_init(&c->uuid_write_mutex, 1);
1462 mutex_init(&c->sort_lock);
1463 1491
1464 spin_lock_init(&c->sort_time.lock);
1465 spin_lock_init(&c->btree_gc_time.lock); 1492 spin_lock_init(&c->btree_gc_time.lock);
1466 spin_lock_init(&c->btree_split_time.lock); 1493 spin_lock_init(&c->btree_split_time.lock);
1467 spin_lock_init(&c->btree_read_time.lock); 1494 spin_lock_init(&c->btree_read_time.lock);
@@ -1489,11 +1516,11 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1489 bucket_pages(c))) || 1516 bucket_pages(c))) ||
1490 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || 1517 !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
1491 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || 1518 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1492 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
1493 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || 1519 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1494 bch_journal_alloc(c) || 1520 bch_journal_alloc(c) ||
1495 bch_btree_cache_alloc(c) || 1521 bch_btree_cache_alloc(c) ||
1496 bch_open_buckets_alloc(c)) 1522 bch_open_buckets_alloc(c) ||
1523 bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1497 goto err; 1524 goto err;
1498 1525
1499 c->congested_read_threshold_us = 2000; 1526 c->congested_read_threshold_us = 2000;
@@ -1549,7 +1576,7 @@ static void run_cache_set(struct cache_set *c)
1549 k = &j->btree_root; 1576 k = &j->btree_root;
1550 1577
1551 err = "bad btree root"; 1578 err = "bad btree root";
1552 if (bch_btree_ptr_invalid(c, k)) 1579 if (__bch_btree_ptr_invalid(c, k))
1553 goto err; 1580 goto err;
1554 1581
1555 err = "error reading btree root"; 1582 err = "error reading btree root";
@@ -1743,6 +1770,7 @@ err:
1743void bch_cache_release(struct kobject *kobj) 1770void bch_cache_release(struct kobject *kobj)
1744{ 1771{
1745 struct cache *ca = container_of(kobj, struct cache, kobj); 1772 struct cache *ca = container_of(kobj, struct cache, kobj);
1773 unsigned i;
1746 1774
1747 if (ca->set) 1775 if (ca->set)
1748 ca->set->cache[ca->sb.nr_this_dev] = NULL; 1776 ca->set->cache[ca->sb.nr_this_dev] = NULL;
@@ -1756,7 +1784,9 @@ void bch_cache_release(struct kobject *kobj)
1756 free_heap(&ca->heap); 1784 free_heap(&ca->heap);
1757 free_fifo(&ca->unused); 1785 free_fifo(&ca->unused);
1758 free_fifo(&ca->free_inc); 1786 free_fifo(&ca->free_inc);
1759 free_fifo(&ca->free); 1787
1788 for (i = 0; i < RESERVE_NR; i++)
1789 free_fifo(&ca->free[i]);
1760 1790
1761 if (ca->sb_bio.bi_inline_vecs[0].bv_page) 1791 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1762 put_page(ca->sb_bio.bi_io_vec[0].bv_page); 1792 put_page(ca->sb_bio.bi_io_vec[0].bv_page);
@@ -1782,10 +1812,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1782 ca->journal.bio.bi_max_vecs = 8; 1812 ca->journal.bio.bi_max_vecs = 8;
1783 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; 1813 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
1784 1814
1785 free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; 1815 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
1786 free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
1787 1816
1788 if (!init_fifo(&ca->free, free, GFP_KERNEL) || 1817 if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
1818 !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1819 !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
1820 !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
1789 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || 1821 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
1790 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || 1822 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
1791 !init_heap(&ca->heap, free << 3, GFP_KERNEL) || 1823 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
@@ -2030,7 +2062,8 @@ static void bcache_exit(void)
2030 kobject_put(bcache_kobj); 2062 kobject_put(bcache_kobj);
2031 if (bcache_wq) 2063 if (bcache_wq)
2032 destroy_workqueue(bcache_wq); 2064 destroy_workqueue(bcache_wq);
2033 unregister_blkdev(bcache_major, "bcache"); 2065 if (bcache_major)
2066 unregister_blkdev(bcache_major, "bcache");
2034 unregister_reboot_notifier(&reboot); 2067 unregister_reboot_notifier(&reboot);
2035} 2068}
2036 2069
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index a1f85612f0b3..c6ab69333a6d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -102,7 +102,6 @@ rw_attribute(bypass_torture_test);
102rw_attribute(key_merging_disabled); 102rw_attribute(key_merging_disabled);
103rw_attribute(gc_always_rewrite); 103rw_attribute(gc_always_rewrite);
104rw_attribute(expensive_debug_checks); 104rw_attribute(expensive_debug_checks);
105rw_attribute(freelist_percent);
106rw_attribute(cache_replacement_policy); 105rw_attribute(cache_replacement_policy);
107rw_attribute(btree_shrinker_disabled); 106rw_attribute(btree_shrinker_disabled);
108rw_attribute(copy_gc_enabled); 107rw_attribute(copy_gc_enabled);
@@ -401,6 +400,48 @@ static struct attribute *bch_flash_dev_files[] = {
401}; 400};
402KTYPE(bch_flash_dev); 401KTYPE(bch_flash_dev);
403 402
403struct bset_stats_op {
404 struct btree_op op;
405 size_t nodes;
406 struct bset_stats stats;
407};
408
409static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
410{
411 struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
412
413 op->nodes++;
414 bch_btree_keys_stats(&b->keys, &op->stats);
415
416 return MAP_CONTINUE;
417}
418
419int bch_bset_print_stats(struct cache_set *c, char *buf)
420{
421 struct bset_stats_op op;
422 int ret;
423
424 memset(&op, 0, sizeof(op));
425 bch_btree_op_init(&op.op, -1);
426
427 ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats);
428 if (ret < 0)
429 return ret;
430
431 return snprintf(buf, PAGE_SIZE,
432 "btree nodes: %zu\n"
433 "written sets: %zu\n"
434 "unwritten sets: %zu\n"
435 "written key bytes: %zu\n"
436 "unwritten key bytes: %zu\n"
437 "floats: %zu\n"
438 "failed: %zu\n",
439 op.nodes,
440 op.stats.sets_written, op.stats.sets_unwritten,
441 op.stats.bytes_written, op.stats.bytes_unwritten,
442 op.stats.floats, op.stats.failed);
443}
444
404SHOW(__bch_cache_set) 445SHOW(__bch_cache_set)
405{ 446{
406 unsigned root_usage(struct cache_set *c) 447 unsigned root_usage(struct cache_set *c)
@@ -419,7 +460,7 @@ lock_root:
419 rw_lock(false, b, b->level); 460 rw_lock(false, b, b->level);
420 } while (b != c->root); 461 } while (b != c->root);
421 462
422 for_each_key_filter(b, k, &iter, bch_ptr_bad) 463 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
423 bytes += bkey_bytes(k); 464 bytes += bkey_bytes(k);
424 465
425 rw_unlock(false, b); 466 rw_unlock(false, b);
@@ -434,7 +475,7 @@ lock_root:
434 475
435 mutex_lock(&c->bucket_lock); 476 mutex_lock(&c->bucket_lock);
436 list_for_each_entry(b, &c->btree_cache, list) 477 list_for_each_entry(b, &c->btree_cache, list)
437 ret += 1 << (b->page_order + PAGE_SHIFT); 478 ret += 1 << (b->keys.page_order + PAGE_SHIFT);
438 479
439 mutex_unlock(&c->bucket_lock); 480 mutex_unlock(&c->bucket_lock);
440 return ret; 481 return ret;
@@ -491,7 +532,7 @@ lock_root:
491 532
492 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); 533 sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms);
493 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); 534 sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us);
494 sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); 535 sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us);
495 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); 536 sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us);
496 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); 537 sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us);
497 538
@@ -711,9 +752,6 @@ SHOW(__bch_cache)
711 sysfs_print(io_errors, 752 sysfs_print(io_errors,
712 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); 753 atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT);
713 754
714 sysfs_print(freelist_percent, ca->free.size * 100 /
715 ((size_t) ca->sb.nbuckets));
716
717 if (attr == &sysfs_cache_replacement_policy) 755 if (attr == &sysfs_cache_replacement_policy)
718 return bch_snprint_string_list(buf, PAGE_SIZE, 756 return bch_snprint_string_list(buf, PAGE_SIZE,
719 cache_replacement_policies, 757 cache_replacement_policies,
@@ -820,32 +858,6 @@ STORE(__bch_cache)
820 } 858 }
821 } 859 }
822 860
823 if (attr == &sysfs_freelist_percent) {
824 DECLARE_FIFO(long, free);
825 long i;
826 size_t p = strtoul_or_return(buf);
827
828 p = clamp_t(size_t,
829 ((size_t) ca->sb.nbuckets * p) / 100,
830 roundup_pow_of_two(ca->sb.nbuckets) >> 9,
831 ca->sb.nbuckets / 2);
832
833 if (!init_fifo_exact(&free, p, GFP_KERNEL))
834 return -ENOMEM;
835
836 mutex_lock(&ca->set->bucket_lock);
837
838 fifo_move(&free, &ca->free);
839 fifo_swap(&free, &ca->free);
840
841 mutex_unlock(&ca->set->bucket_lock);
842
843 while (fifo_pop(&free, i))
844 atomic_dec(&ca->buckets[i].pin);
845
846 free_fifo(&free);
847 }
848
849 if (attr == &sysfs_clear_stats) { 861 if (attr == &sysfs_clear_stats) {
850 atomic_long_set(&ca->sectors_written, 0); 862 atomic_long_set(&ca->sectors_written, 0);
851 atomic_long_set(&ca->btree_sectors_written, 0); 863 atomic_long_set(&ca->btree_sectors_written, 0);
@@ -869,7 +881,6 @@ static struct attribute *bch_cache_files[] = {
869 &sysfs_metadata_written, 881 &sysfs_metadata_written,
870 &sysfs_io_errors, 882 &sysfs_io_errors,
871 &sysfs_clear_stats, 883 &sysfs_clear_stats,
872 &sysfs_freelist_percent,
873 &sysfs_cache_replacement_policy, 884 &sysfs_cache_replacement_policy,
874 NULL 885 NULL
875}; 886};
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1030c6020e98..ac7d0d1f70d7 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -2,6 +2,7 @@
2#ifndef _BCACHE_UTIL_H 2#ifndef _BCACHE_UTIL_H
3#define _BCACHE_UTIL_H 3#define _BCACHE_UTIL_H
4 4
5#include <linux/blkdev.h>
5#include <linux/errno.h> 6#include <linux/errno.h>
6#include <linux/kernel.h> 7#include <linux/kernel.h>
7#include <linux/llist.h> 8#include <linux/llist.h>
@@ -17,11 +18,13 @@ struct closure;
17 18
18#ifdef CONFIG_BCACHE_DEBUG 19#ifdef CONFIG_BCACHE_DEBUG
19 20
21#define EBUG_ON(cond) BUG_ON(cond)
20#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) 22#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
21#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) 23#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i)
22 24
23#else /* DEBUG */ 25#else /* DEBUG */
24 26
27#define EBUG_ON(cond) do { if (cond); } while (0)
25#define atomic_dec_bug(v) atomic_dec(v) 28#define atomic_dec_bug(v) atomic_dec(v)
26#define atomic_inc_bug(v, i) atomic_inc(v) 29#define atomic_inc_bug(v, i) atomic_inc(v)
27 30
@@ -391,6 +394,11 @@ struct time_stats {
391 394
392void bch_time_stats_update(struct time_stats *stats, uint64_t time); 395void bch_time_stats_update(struct time_stats *stats, uint64_t time);
393 396
397static inline unsigned local_clock_us(void)
398{
399 return local_clock() >> 10;
400}
401
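
local_clock() returns nanoseconds, so shifting right by 10 divides by 1024 instead of 1000: about 2.3% low, in exchange for avoiding a 64-bit division in hot paths. The difference, worked out for an invented 5-second reading:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t ns = 5000000000ull;                /* 5 s worth of local_clock() */

    printf("exact: %llu us, shifted: %llu\n",
           (unsigned long long)(ns / 1000),     /* 5000000 */
           (unsigned long long)(ns >> 10));     /* 4882812 */
    return 0;
}
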
394#define NSEC_PER_ns 1L 402#define NSEC_PER_ns 1L
395#define NSEC_PER_us NSEC_PER_USEC 403#define NSEC_PER_us NSEC_PER_USEC
396#define NSEC_PER_ms NSEC_PER_MSEC 404#define NSEC_PER_ms NSEC_PER_MSEC