path: root/drivers/md/bcache/alloc.c
author	Kent Overstreet <koverstreet@google.com>	2013-03-23 19:11:31 -0400
committer	Kent Overstreet <koverstreet@google.com>	2013-03-23 19:11:31 -0400
commit	cafe563591446cf80bfbc2fe3bc72a2e36cf1060 (patch)
tree	c8ae27b13dcdb0219634376ca5e667df32b1173a /drivers/md/bcache/alloc.c
parent	ea6749c705d9e629ed03c7336cc929fc6014b834 (diff)
bcache: A block layer cache
Does writethrough and writeback caching, handles unclean shutdown, and has a
bunch of other nifty features motivated by real world usage. See the wiki at
http://bcache.evilpiepirate.org for more.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
Diffstat (limited to 'drivers/md/bcache/alloc.c')
-rw-r--r--	drivers/md/bcache/alloc.c	583
1 file changed, 583 insertions, 0 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
new file mode 100644
index 000000000000..ed18115e078e
--- /dev/null
+++ b/drivers/md/bcache/alloc.c
@@ -0,0 +1,583 @@
/*
 * Primary bucket allocation code
 *
 * Copyright 2012 Google, Inc.
 *
 * Allocation in bcache is done in terms of buckets:
 *
 * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
 * btree pointers - they must match for the pointer to be considered valid.
 *
 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
 * bucket simply by incrementing its gen.
 *
 * The gens (along with the priorities; it's really the gens that are
 * important, but the code is named as if it's the priorities) are written in
 * an arbitrary list of buckets on disk, with a pointer to them in the journal
 * header.
 *
 * When we invalidate a bucket, we have to write its new gen to disk and wait
 * for that write to complete before we use it - otherwise after a crash we
 * could have pointers that appeared to be good but pointed to data that had
 * been overwritten.
 *
 * Since the gens and priorities are all stored contiguously on disk, we can
 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
 * call prio_write(), and when prio_write() finishes we pull buckets off the
 * free_inc list and optionally discard them.
 *
 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
 * priorities and gens were being written before we could allocate. c->free is
 * a smaller freelist, and buckets on that list are always ready to be used.
 *
 * If we've got discards enabled, that happens when a bucket moves from the
 * free_inc list to the free list.
 *
 * There is another freelist, because sometimes we have buckets that we know
 * have nothing pointing into them - these we can reuse without waiting for
 * priorities to be rewritten. These come from freed btree nodes and buckets
 * that garbage collection discovered no longer had valid keys pointing into
 * them (because they were overwritten). That's the unused list - buckets on
 * the unused list move to the free list, optionally being discarded in the
 * process.
 *
 * It's also important to ensure that gens don't wrap around - with respect to
 * either the oldest gen in the btree or the gen on disk. This is quite
 * difficult to do in practice, but we explicitly guard against it anyway - if
 * a bucket is in danger of wrapping around we simply skip invalidating it that
 * time around, and we garbage collect or rewrite the priorities sooner than we
 * would have otherwise.
 *
 * bch_bucket_alloc() allocates a single bucket from a specific cache.
 *
 * bch_bucket_alloc_set() allocates one or more buckets from different caches
 * out of a cache set.
 *
 * bch_allocator_thread() drives all the processes described above;
 * bch_bucket_alloc() and the other places that need free buckets wake it up
 * when more are needed.
 *
 * invalidate_buckets_(lru|fifo|random)() find buckets that are available to be
 * invalidated, and then invalidate them and stick them on the free_inc list -
 * in either lru, fifo or random order.
 */
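
/*
 * Illustrative sketch of the gen check described above - a pointer is only
 * usable while its gen matches the gen of the bucket it points into,
 * roughly:
 *
 *	if (PTR_BUCKET(c, k, i)->gen != PTR_GEN(k, i))
 *		the pointer is stale and must not be followed;
 *
 * The real helper for this lives in bcache.h (ptr_stale()); the exact form
 * shown here is only for illustration.
 */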

#include "bcache.h"
#include "btree.h"

#include <linux/random.h>

#define MAX_IN_FLIGHT_DISCARDS		8U

/* Bucket heap / gen */

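/*
 * Bumping a bucket's gen invalidates every existing pointer into that bucket.
 * need_gc tracks the largest distance any bucket's gen has moved ahead of the
 * gen recorded at the last garbage collection, and need_save_prio (in sync
 * mode) the distance ahead of the gen last written to disk; both feed the
 * decisions elsewhere about when to gc or rewrite the priorities so the 8 bit
 * gens can't wrap.
 */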
uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
{
	uint8_t ret = ++b->gen;

	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);

	if (CACHE_SYNC(&ca->set->sb)) {
		ca->need_save_prio = max(ca->need_save_prio,
					 bucket_disk_gen(b));
		WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
	}

	return ret;
}

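/*
 * Priority aging: c->rescale counts down by the number of sectors passed in,
 * and each time roughly nbuckets * bucket_size / 1024 sectors have gone by,
 * every unpinned non-btree bucket with a nonzero prio is aged by one and
 * c->min_prio is recomputed - this is what keeps recently used buckets
 * looking hotter than buckets that haven't seen IO for a while.
 */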
void bch_rescale_priorities(struct cache_set *c, int sectors)
{
	struct cache *ca;
	struct bucket *b;
	unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
	unsigned i;
	int r;

	atomic_sub(sectors, &c->rescale);

	do {
		r = atomic_read(&c->rescale);

		if (r >= 0)
			return;
	} while (atomic_cmpxchg(&c->rescale, r, r + next) != r);

	mutex_lock(&c->bucket_lock);

	c->min_prio = USHRT_MAX;

	for_each_cache(ca, c, i)
		for_each_bucket(b, ca)
			if (b->prio &&
			    b->prio != BTREE_PRIO &&
			    !atomic_read(&b->pin)) {
				b->prio--;
				c->min_prio = min(c->min_prio, b->prio);
			}

	mutex_unlock(&c->bucket_lock);
}

/* Discard/TRIM */

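/*
 * Buckets coming off free_inc may be discarded before going on the free
 * list: do_discard() grabs an idle struct discard from ca->discards and
 * issues a REQ_DISCARD bio covering the whole bucket; discard_endio() punts
 * completion to a workqueue, and discard_finish() then pushes the bucket
 * onto ca->free, puts the struct discard back on the idle list and wakes up
 * anyone waiting for a free bucket.
 */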
struct discard {
	struct list_head	list;
	struct work_struct	work;
	struct cache		*ca;
	long			bucket;

	struct bio		bio;
	struct bio_vec		bv;
};

static void discard_finish(struct work_struct *w)
{
	struct discard *d = container_of(w, struct discard, work);
	struct cache *ca = d->ca;
	char buf[BDEVNAME_SIZE];

	if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
		pr_notice("discard error on %s, disabling",
			 bdevname(ca->bdev, buf));
		d->ca->discard = 0;
	}

	mutex_lock(&ca->set->bucket_lock);

	fifo_push(&ca->free, d->bucket);
	list_add(&d->list, &ca->discards);
	atomic_dec(&ca->discards_in_flight);

	mutex_unlock(&ca->set->bucket_lock);

	closure_wake_up(&ca->set->bucket_wait);
	wake_up(&ca->set->alloc_wait);

	closure_put(&ca->set->cl);
}

static void discard_endio(struct bio *bio, int error)
{
	struct discard *d = container_of(bio, struct discard, bio);
	schedule_work(&d->work);
}

static void do_discard(struct cache *ca, long bucket)
{
	struct discard *d = list_first_entry(&ca->discards,
					     struct discard, list);

	list_del(&d->list);
	d->bucket = bucket;

	atomic_inc(&ca->discards_in_flight);
	closure_get(&ca->set->cl);

	bio_init(&d->bio);

	d->bio.bi_sector	= bucket_to_sector(ca->set, d->bucket);
	d->bio.bi_bdev		= ca->bdev;
	d->bio.bi_rw		= REQ_WRITE|REQ_DISCARD;
	d->bio.bi_max_vecs	= 1;
	d->bio.bi_io_vec	= d->bio.bi_inline_vecs;
	d->bio.bi_size		= bucket_bytes(ca);
	d->bio.bi_end_io	= discard_endio;
	bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	submit_bio(0, &d->bio);
}

/* Allocation */

static inline bool can_inc_bucket_gen(struct bucket *b)
{
	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
		bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
}

bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
{
	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));

	if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
	    CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
		return false;

	b->prio = 0;

	if (can_inc_bucket_gen(b) &&
	    fifo_push(&ca->unused, b - ca->buckets)) {
		atomic_inc(&b->pin);
		return true;
	}

	return false;
}

static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
	return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
		!atomic_read(&b->pin) &&
		can_inc_bucket_gen(b);
}

static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
	bch_inc_gen(ca, b);
	b->prio = INITIAL_PRIO;
	atomic_inc(&b->pin);
	fifo_push(&ca->free_inc, b - ca->buckets);
}

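/*
 * LRU-ish selection: walk every reclaimable bucket; completely empty buckets
 * go straight onto the unused list, the rest compete for a slot in ca->heap
 * keyed by (prio - min_prio) * sectors used, so the heap ends up holding the
 * cheapest candidates. Those are then popped cheapest-first into free_inc;
 * if free_inc can't be filled (or too few candidates were found), garbage
 * collection is queued to create more.
 */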
static void invalidate_buckets_lru(struct cache *ca)
{
	unsigned bucket_prio(struct bucket *b)
	{
		return ((unsigned) (b->prio - ca->set->min_prio)) *
			GC_SECTORS_USED(b);
	}

	bool bucket_max_cmp(struct bucket *l, struct bucket *r)
	{
		return bucket_prio(l) < bucket_prio(r);
	}

	bool bucket_min_cmp(struct bucket *l, struct bucket *r)
	{
		return bucket_prio(l) > bucket_prio(r);
	}

	struct bucket *b;
	ssize_t i;

	ca->heap.used = 0;

	for_each_bucket(b, ca) {
		if (!can_invalidate_bucket(ca, b))
			continue;

		if (!GC_SECTORS_USED(b)) {
			if (!bch_bucket_add_unused(ca, b))
				return;
		} else {
			if (!heap_full(&ca->heap))
				heap_add(&ca->heap, b, bucket_max_cmp);
			else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
				ca->heap.data[0] = b;
				heap_sift(&ca->heap, 0, bucket_max_cmp);
			}
		}
	}

	if (ca->heap.used * 2 < ca->heap.size)
		bch_queue_gc(ca->set);

	for (i = ca->heap.used / 2 - 1; i >= 0; --i)
		heap_sift(&ca->heap, i, bucket_min_cmp);

	while (!fifo_full(&ca->free_inc)) {
		if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
			/* We don't want to be calling invalidate_buckets()
			 * multiple times when it can't do anything
			 */
			ca->invalidate_needs_gc = 1;
			bch_queue_gc(ca->set);
			return;
		}

		invalidate_one_bucket(ca, b);
	}
}

static void invalidate_buckets_fifo(struct cache *ca)
{
	struct bucket *b;
	size_t checked = 0;

	while (!fifo_full(&ca->free_inc)) {
		if (ca->fifo_last_bucket < ca->sb.first_bucket ||
		    ca->fifo_last_bucket >= ca->sb.nbuckets)
			ca->fifo_last_bucket = ca->sb.first_bucket;

		b = ca->buckets + ca->fifo_last_bucket++;

		if (can_invalidate_bucket(ca, b))
			invalidate_one_bucket(ca, b);

		if (++checked >= ca->sb.nbuckets) {
			ca->invalidate_needs_gc = 1;
			bch_queue_gc(ca->set);
			return;
		}
	}
}

static void invalidate_buckets_random(struct cache *ca)
{
	struct bucket *b;
	size_t checked = 0;

	while (!fifo_full(&ca->free_inc)) {
		size_t n;
		get_random_bytes(&n, sizeof(n));

		n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
		n += ca->sb.first_bucket;

		b = ca->buckets + n;

		if (can_invalidate_bucket(ca, b))
			invalidate_one_bucket(ca, b);

		if (++checked >= ca->sb.nbuckets / 2) {
			ca->invalidate_needs_gc = 1;
			bch_queue_gc(ca->set);
			return;
		}
	}
}

static void invalidate_buckets(struct cache *ca)
{
	if (ca->invalidate_needs_gc)
		return;

	switch (CACHE_REPLACEMENT(&ca->sb)) {
	case CACHE_REPLACEMENT_LRU:
		invalidate_buckets_lru(ca);
		break;
	case CACHE_REPLACEMENT_FIFO:
		invalidate_buckets_fifo(ca);
		break;
	case CACHE_REPLACEMENT_RANDOM:
		invalidate_buckets_random(ca);
		break;
	}
}

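/*
 * Sleep until @cond becomes true, dropping bucket_lock while asleep and
 * retaking it before rechecking. If the cache set is being stopped, the
 * allocator's closure returns from inside the macro (cl is picked up from
 * the calling scope), so this is only usable from bch_allocator_thread().
 */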
#define allocator_wait(ca, cond)					\
do {									\
	DEFINE_WAIT(__wait);						\
									\
	while (!(cond)) {						\
		prepare_to_wait(&ca->set->alloc_wait,			\
				&__wait, TASK_INTERRUPTIBLE);		\
									\
		mutex_unlock(&(ca)->set->bucket_lock);			\
		if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) {	\
			finish_wait(&ca->set->alloc_wait, &__wait);	\
			closure_return(cl);				\
		}							\
									\
		schedule();						\
		__set_current_state(TASK_RUNNING);			\
		mutex_lock(&(ca)->set->bucket_lock);			\
	}								\
									\
	finish_wait(&ca->set->alloc_wait, &__wait);			\
} while (0)

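/*
 * Main allocator loop: first move buckets from ca->unused and ca->free_inc
 * to ca->free, issuing discards on the way if they're enabled and throttling
 * so there's always room in ca->free for the buckets in flight; then, once
 * the gc marks are valid, invalidate more buckets onto free_inc and, in sync
 * mode, write the new gens/priorities out with bch_prio_write() so those
 * buckets are safe to reuse.
 */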
void bch_allocator_thread(struct closure *cl)
{
	struct cache *ca = container_of(cl, struct cache, alloc);

	mutex_lock(&ca->set->bucket_lock);

	while (1) {
		while (1) {
			long bucket;

			if ((!atomic_read(&ca->set->prio_blocked) ||
			     !CACHE_SYNC(&ca->set->sb)) &&
			    !fifo_empty(&ca->unused))
				fifo_pop(&ca->unused, bucket);
			else if (!fifo_empty(&ca->free_inc))
				fifo_pop(&ca->free_inc, bucket);
			else
				break;

			allocator_wait(ca, (int) fifo_free(&ca->free) >
				       atomic_read(&ca->discards_in_flight));

			if (ca->discard) {
				allocator_wait(ca, !list_empty(&ca->discards));
				do_discard(ca, bucket);
			} else {
				fifo_push(&ca->free, bucket);
				closure_wake_up(&ca->set->bucket_wait);
			}
		}

		allocator_wait(ca, ca->set->gc_mark_valid);
		invalidate_buckets(ca);

		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) ||
			       !CACHE_SYNC(&ca->set->sb));

		if (CACHE_SYNC(&ca->set->sb) &&
		    (!fifo_empty(&ca->free_inc) ||
		     ca->need_save_prio > 64)) {
			bch_prio_write(ca);
		}
	}
}

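/*
 * Pop a bucket off ca->free, but only if that leaves at least
 * ca->watermark[watermark] buckets behind - lower watermarks (prio,
 * metadata) may dig deeper into the reserve than normal allocations. On
 * success the bucket is marked metadata or reclaimable and its prio reset.
 * On failure the freelist state is logged and, if a closure was passed, it
 * is parked on bucket_wait; a blocking closure waits and retries, anything
 * else gets -1 back immediately.
 */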
long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
{
	long r = -1;
again:
	wake_up(&ca->set->alloc_wait);

	if (fifo_used(&ca->free) > ca->watermark[watermark] &&
	    fifo_pop(&ca->free, r)) {
		struct bucket *b = ca->buckets + r;
#ifdef CONFIG_BCACHE_EDEBUG
		size_t iter;
		long i;

		for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);

		fifo_for_each(i, &ca->free, iter)
			BUG_ON(i == r);
		fifo_for_each(i, &ca->free_inc, iter)
			BUG_ON(i == r);
		fifo_for_each(i, &ca->unused, iter)
			BUG_ON(i == r);
#endif
		BUG_ON(atomic_read(&b->pin) != 1);

		SET_GC_SECTORS_USED(b, ca->sb.bucket_size);

		if (watermark <= WATERMARK_METADATA) {
			SET_GC_MARK(b, GC_MARK_METADATA);
			b->prio = BTREE_PRIO;
		} else {
			SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
			b->prio = INITIAL_PRIO;
		}

		return r;
	}

	pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
		 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
		 fifo_used(&ca->free_inc), fifo_used(&ca->unused));

	if (cl) {
		closure_wait(&ca->set->bucket_wait, cl);

		if (closure_blocking(cl)) {
			mutex_unlock(&ca->set->bucket_lock);
			closure_sync(cl);
			mutex_lock(&ca->set->bucket_lock);
			goto again;
		}
	}

	return -1;
}

void bch_bucket_free(struct cache_set *c, struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bucket *b = PTR_BUCKET(c, k, i);

		SET_GC_MARK(b, 0);
		SET_GC_SECTORS_USED(b, 0);
		bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
	}
}

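/*
 * Allocate one bucket from each of the first n caches (in cache_by_alloc
 * order) and build them into the pointers of *k; if any allocation fails,
 * the buckets already taken are released again and -1 is returned. The
 * caller must hold bucket_lock - bch_bucket_alloc_set() below is the
 * locking wrapper.
 */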
int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
			   struct bkey *k, int n, struct closure *cl)
{
	int i;

	lockdep_assert_held(&c->bucket_lock);
	BUG_ON(!n || n > c->caches_loaded || n > 8);

	bkey_init(k);

	/* sort by free space/prio of oldest data in caches */

	for (i = 0; i < n; i++) {
		struct cache *ca = c->cache_by_alloc[i];
		long b = bch_bucket_alloc(ca, watermark, cl);

		if (b == -1)
			goto err;

		k->ptr[i] = PTR(ca->buckets[b].gen,
				bucket_to_sector(c, b),
				ca->sb.nr_this_dev);

		SET_KEY_PTRS(k, i + 1);
	}

	return 0;
err:
	bch_bucket_free(c, k);
	__bkey_put(c, k);
	return -1;
}

int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
			 struct bkey *k, int n, struct closure *cl)
{
	int ret;
	mutex_lock(&c->bucket_lock);
	ret = __bch_bucket_alloc_set(c, watermark, k, n, cl);
	mutex_unlock(&c->bucket_lock);
	return ret;
}

/* Init */

void bch_cache_allocator_exit(struct cache *ca)
{
	struct discard *d;

	while (!list_empty(&ca->discards)) {
		d = list_first_entry(&ca->discards, struct discard, list);
		cancel_work_sync(&d->work);
		list_del(&d->list);
		kfree(d);
	}
}

int bch_cache_allocator_init(struct cache *ca)
{
	unsigned i;

	/*
	 * Reserve:
	 * Prio/gen writes first
	 * Then 8 for btree allocations
	 * Then half for the moving garbage collector
	 */

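	/*
	 * As a purely hypothetical example: with prio_buckets(ca) == 2 and
	 * ca->free.size == 64 this works out to PRIO = 0, METADATA = 2,
	 * MOVINGGC = 10 and NONE = 42 - each allocation type may only take a
	 * bucket while more than its watermark remain on the free list.
	 */
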
	ca->watermark[WATERMARK_PRIO] = 0;

	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);

	ca->watermark[WATERMARK_MOVINGGC] = 8 +
		ca->watermark[WATERMARK_METADATA];

	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
		ca->watermark[WATERMARK_MOVINGGC];

	for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
		struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
		if (!d)
			return -ENOMEM;

		d->ca = ca;
		INIT_WORK(&d->work, discard_finish);
		list_add(&d->list, &ca->discards);
	}

	return 0;
}