bcache: A block layer cache

Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet <koverstreet@google.com>
author: Kent Overstreet <koverstreet@google.com> 2013-03-23 19:11:31 -0400
committer: Kent Overstreet <koverstreet@google.com> 2013-03-23 19:11:31 -0400
commit: cafe563591446cf80bfbc2fe3bc72a2e36cf1060 (patch)
tree: c8ae27b13dcdb0219634376ca5e667df32b1173a /drivers/md/bcache/journal.c
parent: ea6749c705d9e629ed03c7336cc929fc6014b834 (diff)
1 files changed, 785 insertions, 0 deletions
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
new file mode 100644
index 000000000000..c871ffaabbb0
--- /dev/null
+++ b/drivers/md/bcache/journal.c
@@ -0,0 +1,785 @@
+/*
+ * bcache journalling code, for btree insertions
+ *
+ * Copyright 2012 Google, Inc.
+ */
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+/*
+ * Journal replay/recovery:
+ *
+ * This code is all driven from run_cache_set(); we first read the journal
+ * entries, do some other stuff, then we mark all the keys in the journal
+ * entries (same as garbage collection would), then we replay them - reinserting
+ * them into the cache in precisely the same order as they appear in the
+ * journal.
+ *
+ * We only journal keys that go in leaf nodes, which simplifies things quite a
+ * bit.
+ */
+static void journal_read_endio(struct bio *bio, int error)
+{
+        struct closure *cl = bio->bi_private;
+        closure_put(cl);
+}
+static int journal_read_bucket(struct cache *ca, struct list_head *list,
+                               struct btree_op *op, unsigned bucket_index)
+{
+        struct journal_device *ja = &ca->journal;
+        struct bio *bio = &ja->bio;
+        struct journal_replay *i;
+        struct jset *j, *data = ca->set->journal.w[0].data;
+        unsigned len, left, offset = 0;
+        int ret = 0;
+        sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
+        pr_debug("reading %llu", (uint64_t) bucket);
+        while (offset < ca->sb.bucket_size) {
+reread:         left = ca->sb.bucket_size - offset;
+                len = min_t(unsigned, left, PAGE_SECTORS * 8);
+                bio_reset(bio);
+                bio->bi_sector  = bucket + offset;
+                bio->bi_bdev    = ca->bdev;
+                bio->bi_rw      = READ;
+                bio->bi_size    = len << 9;
+                bio->bi_end_io  = journal_read_endio;
+                bio->bi_private = &op->cl;
+                bio_map(bio, data);
+                closure_bio_submit(bio, &op->cl, ca);
+                closure_sync(&op->cl);
+                /* This function could be simpler now since we no longer write
+                 * journal entries that overlap bucket boundaries; this means
+                 * the start of a bucket will always have a valid journal entry
+                 * if it has any journal entries at all.
+                 */
+                j = data;
+                while (len) {
+                        struct list_head *where;
+                        size_t blocks, bytes = set_bytes(j);
+                        if (j->magic != jset_magic(ca->set))
+                                return ret;
+                        if (bytes > left << 9)
+                                return ret;
+                        if (bytes > len << 9)
+                                goto reread;
+                        if (j->csum != csum_set(j))
+                                return ret;
+                        blocks = set_blocks(j, ca->set);
+                        while (!list_empty(list)) {
+                                i = list_first_entry(list,
+                                        struct journal_replay, list);
+                                if (i->j.seq >= j->last_seq)
+                                        break;
+                                list_del(&i->list);
+                                kfree(i);
+                        }
+                        list_for_each_entry_reverse(i, list, list) {
+                                if (j->seq == i->j.seq)
+                                        goto next_set;
+                                if (j->seq < i->j.last_seq)
+                                        goto next_set;
+                                if (j->seq > i->j.seq) {
+                                        where = &i->list;
+                                        goto add;
+                                }
+                        }
+                        where = list;
+add:
+                        i = kmalloc(offsetof(struct journal_replay, j) +
+                                    bytes, GFP_KERNEL);
+                        if (!i)
+                                return -ENOMEM;
+                        memcpy(&i->j, j, bytes);
+                        list_add(&i->list, where);
+                        ret = 1;
+                        ja->seq[bucket_index] = j->seq;
+next_set:
+                        offset  += blocks * ca->sb.block_size;
+                        len     -= blocks * ca->sb.block_size;
+                        j = ((void *) j) + blocks * block_bytes(ca);
+                }
+        }
+        return ret;
+}
+int bch_journal_read(struct cache_set *c, struct list_head *list,
+                        struct btree_op *op)
+{
+#define read_bucket(b)                                                  \
+        ({                                                              \
+                int ret = journal_read_bucket(ca, list, op, b);         \
+                __set_bit(b, bitmap);                                   \
+                if (ret < 0)                                            \
+                        return ret;                                     \
+                ret;                                                    \
+        })
+        struct cache *ca;
+        unsigned iter;
+        for_each_cache(ca, c, iter) {
+                struct journal_device *ja = &ca->journal;
+                unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
+                unsigned i, l, r, m;
+                uint64_t seq;
+                bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
+                pr_debug("%u journal buckets", ca->sb.njournal_buckets);
+                /* Read journal buckets ordered by golden ratio hash to quickly
+                 * find a sequence of buckets with valid journal entries
+                 */
+                for (i = 0; i < ca->sb.njournal_buckets; i++) {
+                        l = (i * 2654435769U) % ca->sb.njournal_buckets;
+                        if (test_bit(l, bitmap))
+                                break;
+                        if (read_bucket(l))
+                                goto bsearch;
+                }
+                /* If that fails, check all the buckets we haven't checked
+                 * already
+                 */
+                pr_debug("falling back to linear search");
+                for (l = 0; l < ca->sb.njournal_buckets; l++) {
+                        if (test_bit(l, bitmap))
+                                continue;
+                        if (read_bucket(l))
+                                goto bsearch;
+                }
+bsearch:
+                /* Binary search */
+                m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+                pr_debug("starting binary search, l %u r %u", l, r);
+                while (l + 1 < r) {
+                        m = (l + r) >> 1;
+                        if (read_bucket(m))
+                                l = m;
+                        else
+                                r = m;
+                }
+                /* Read buckets in reverse order until we stop finding more
+                 * journal entries
+                 */
+                pr_debug("finishing up");
+                l = m;
+                while (1) {
+                        if (!l--)
+                                l = ca->sb.njournal_buckets - 1;
+                        if (l == m)
+                                break;
+                        if (test_bit(l, bitmap))
+                                continue;
+                        if (!read_bucket(l))
+                                break;
+                }
+                seq = 0;
+                for (i = 0; i < ca->sb.njournal_buckets; i++)
+                        if (ja->seq[i] > seq) {
+                                seq = ja->seq[i];
+                                ja->cur_idx = ja->discard_idx =
+                                        ja->last_idx = i;
+                        }
+        }
+        c->journal.seq = list_entry(list->prev,
+                                    struct journal_replay,
+                                    list)->j.seq;
+        return 0;
+#undef read_bucket
+}
+void bch_journal_mark(struct cache_set *c, struct list_head *list)
+{
+        atomic_t p = { 0 };
+        struct bkey *k;
+        struct journal_replay *i;
+        struct journal *j = &c->journal;
+        uint64_t last = j->seq;
+        /*
+         * journal.pin should never fill up - we never write a journal
+         * entry when it would fill up. But if for some reason it does, we
+         * iterate over the list in reverse order so that we can just skip that
+         * refcount instead of bugging.
+         */
+        list_for_each_entry_reverse(i, list, list) {
+                BUG_ON(last < i->j.seq);
+                i->pin = NULL;
+                while (last-- != i->j.seq)
+                        if (fifo_free(&j->pin) > 1) {
+                                fifo_push_front(&j->pin, p);
+                                atomic_set(&fifo_front(&j->pin), 0);
+                        }
+                if (fifo_free(&j->pin) > 1) {
+                        fifo_push_front(&j->pin, p);
+                        i->pin = &fifo_front(&j->pin);
+                        atomic_set(i->pin, 1);
+                }
+                for (k = i->j.start;
+                     k < end(&i->j);
+                     k = bkey_next(k)) {
+                        unsigned j;
+                        for (j = 0; j < KEY_PTRS(k); j++) {
+                                struct bucket *g = PTR_BUCKET(c, k, j);
+                                atomic_inc(&g->pin);
+                                if (g->prio == BTREE_PRIO &&
+                                    !ptr_stale(c, k, j))
+                                        g->prio = INITIAL_PRIO;
+                        }
+                        __bch_btree_mark_key(c, 0, k);
+                }
+        }
+}
+int bch_journal_replay(struct cache_set *s, struct list_head *list,
+                          struct btree_op *op)
+{
+        int ret = 0, keys = 0, entries = 0;
+        struct bkey *k;
+        struct journal_replay *i =
+                list_entry(list->prev, struct journal_replay, list);
+        uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
+        list_for_each_entry(i, list, list) {
+                BUG_ON(i->pin && atomic_read(i->pin) != 1);
+                if (n != i->j.seq)
+                        pr_err("journal entries %llu-%llu "
+                               "missing! (replaying %llu-%llu)\n",
+                               n, i->j.seq - 1, start, end);
+                for (k = i->j.start;
+                     k < end(&i->j);
+                     k = bkey_next(k)) {
+                        pr_debug("%s", pkey(k));
+                        bkey_copy(op->keys.top, k);
+                        bch_keylist_push(&op->keys);
+                        op->journal = i->pin;
+                        atomic_inc(op->journal);
+                        ret = bch_btree_insert(op, s);
+                        if (ret)
+                                goto err;
+                        BUG_ON(!bch_keylist_empty(&op->keys));
+                        keys++;
+                        cond_resched();
+                }
+                if (i->pin)
+                        atomic_dec(i->pin);
+                n = i->j.seq + 1;
+                entries++;
+        }
+        pr_info("journal replay done, %i keys in %i entries, seq %llu",
+                keys, entries, end);
+        while (!list_empty(list)) {
+                i = list_first_entry(list, struct journal_replay, list);
+                list_del(&i->list);
+                kfree(i);
+        }
+err:
+        closure_sync(&op->cl);
+        return ret;
+}
+/* Journalling */
+static void btree_flush_write(struct cache_set *c)
+{
+        /*
+         * Try to find the btree node with that references the oldest journal
+         * entry, best is our current candidate and is locked if non NULL:
+         */
+        struct btree *b, *best = NULL;
+        unsigned iter;
+        for_each_cached_btree(b, c, iter) {
+                if (!down_write_trylock(&b->lock))
+                        continue;
+                if (!btree_node_dirty(b) ||
+                    !btree_current_write(b)->journal) {
+                        rw_unlock(true, b);
+                        continue;
+                }
+                if (!best)
+                        best = b;
+                else if (journal_pin_cmp(c,
+                                         btree_current_write(best),
+                                         btree_current_write(b))) {
+                        rw_unlock(true, best);
+                        best = b;
+                } else
+                        rw_unlock(true, b);
+        }
+        if (best)
+                goto out;
+        /* We can't find the best btree node, just pick the first */
+        list_for_each_entry(b, &c->btree_cache, list)
+                if (!b->level && btree_node_dirty(b)) {
+                        best = b;
+                        rw_lock(true, best, best->level);
+                        goto found;
+                }
+out:
+        if (!best)
+                return;
+found:
+        if (btree_node_dirty(best))
+                bch_btree_write(best, true, NULL);
+        rw_unlock(true, best);
+}
+#define last_seq(j)     ((j)->seq - fifo_used(&(j)->pin) + 1)
+static void journal_discard_endio(struct bio *bio, int error)
+{
+        struct journal_device *ja =
+                container_of(bio, struct journal_device, discard_bio);
+        struct cache *ca = container_of(ja, struct cache, journal);
+        atomic_set(&ja->discard_in_flight, DISCARD_DONE);
+        closure_wake_up(&ca->set->journal.wait);
+        closure_put(&ca->set->cl);
+}
+static void journal_discard_work(struct work_struct *work)
+{
+        struct journal_device *ja =
+                container_of(work, struct journal_device, discard_work);
+        submit_bio(0, &ja->discard_bio);
+}
+static void do_journal_discard(struct cache *ca)
+{
+        struct journal_device *ja = &ca->journal;
+        struct bio *bio = &ja->discard_bio;
+        if (!ca->discard) {
+                ja->discard_idx = ja->last_idx;
+                return;
+        }
+        switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) {
+        case DISCARD_IN_FLIGHT:
+                return;
+        case DISCARD_DONE:
+                ja->discard_idx = (ja->discard_idx + 1) %
+                        ca->sb.njournal_buckets;
+                atomic_set(&ja->discard_in_flight, DISCARD_READY);
+                /* fallthrough */
+        case DISCARD_READY:
+                if (ja->discard_idx == ja->last_idx)
+                        return;
+                atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
+                bio_init(bio);
+                bio->bi_sector          = bucket_to_sector(ca->set,
+                                                           ca->sb.d[ja->discard_idx]);
+                bio->bi_bdev            = ca->bdev;
+                bio->bi_rw              = REQ_WRITE|REQ_DISCARD;
+                bio->bi_max_vecs        = 1;
+                bio->bi_io_vec          = bio->bi_inline_vecs;
+                bio->bi_size            = bucket_bytes(ca);
+                bio->bi_end_io          = journal_discard_endio;
+                closure_get(&ca->set->cl);
+                INIT_WORK(&ja->discard_work, journal_discard_work);
+                schedule_work(&ja->discard_work);
+        }
+}
+static void journal_reclaim(struct cache_set *c)
+{
+        struct bkey *k = &c->journal.key;
+        struct cache *ca;
+        uint64_t last_seq;
+        unsigned iter, n = 0;
+        atomic_t p;
+        while (!atomic_read(&fifo_front(&c->journal.pin)))
+                fifo_pop(&c->journal.pin, p);
+        last_seq = last_seq(&c->journal);
+        /* Update last_idx */
+        for_each_cache(ca, c, iter) {
+                struct journal_device *ja = &ca->journal;
+                while (ja->last_idx != ja->cur_idx &&
+                       ja->seq[ja->last_idx] < last_seq)
+                        ja->last_idx = (ja->last_idx + 1) %
+                                ca->sb.njournal_buckets;
+        }
+        for_each_cache(ca, c, iter)
+                do_journal_discard(ca);
+        if (c->journal.blocks_free)
+                return;
+        /*
+         * Allocate:
+         * XXX: Sort by free journal space
+         */
+        for_each_cache(ca, c, iter) {
+                struct journal_device *ja = &ca->journal;
+                unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
+                /* No space available on this device */
+                if (next == ja->discard_idx)
+                        continue;
+                ja->cur_idx = next;
+                k->ptr[n++] = PTR(0,
+                                  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
+                                  ca->sb.nr_this_dev);
+        }
+        bkey_init(k);
+        SET_KEY_PTRS(k, n);
+        if (n)
+                c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
+        if (!journal_full(&c->journal))
+                __closure_wake_up(&c->journal.wait);
+}
+void bch_journal_next(struct journal *j)
+{
+        atomic_t p = { 1 };
+        j->cur = (j->cur == j->w)
+                ? &j->w[1]
+                : &j->w[0];
+        /*
+         * The fifo_push() needs to happen at the same time as j->seq is
+         * incremented for last_seq() to be calculated correctly
+         */
+        BUG_ON(!fifo_push(&j->pin, p));
+        atomic_set(&fifo_back(&j->pin), 1);
+        j->cur->data->seq       = ++j->seq;
+        j->cur->need_write      = false;
+        j->cur->data->keys      = 0;
+        if (fifo_full(&j->pin))
+                pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
+}
+static void journal_write_endio(struct bio *bio, int error)
+{
+        struct journal_write *w = bio->bi_private;
+        cache_set_err_on(error, w->c, "journal io error");
+        closure_put(&w->c->journal.io.cl);
+}
+static void journal_write(struct closure *);
+static void journal_write_done(struct closure *cl)
+{
+        struct journal *j = container_of(cl, struct journal, io.cl);
+        struct cache_set *c = container_of(j, struct cache_set, journal);
+        struct journal_write *w = (j->cur == j->w)
+                ? &j->w[1]
+                : &j->w[0];
+        __closure_wake_up(&w->wait);
+        if (c->journal_delay_ms)
+                closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
+        continue_at(cl, journal_write, system_wq);
+}
+static void journal_write_unlocked(struct closure *cl)
+{
+        struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+        struct cache *ca;
+        struct journal_write *w = c->journal.cur;
+        struct bkey *k = &c->journal.key;
+        unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
+        struct bio *bio;
+        struct bio_list list;
+        bio_list_init(&list);
+        if (!w->need_write) {
+                /*
+                 * XXX: have to unlock closure before we unlock journal lock,
+                 * else we race with bch_journal(). But this way we race
+                 * against cache set unregister. Doh.
+                 */
+                set_closure_fn(cl, NULL, NULL);
+                closure_sub(cl, CLOSURE_RUNNING + 1);
+                spin_unlock(&c->journal.lock);
+                return;
+        } else if (journal_full(&c->journal)) {
+                journal_reclaim(c);
+                spin_unlock(&c->journal.lock);
+                btree_flush_write(c);
+                continue_at(cl, journal_write, system_wq);
+        }
+        c->journal.blocks_free -= set_blocks(w->data, c);
+        w->data->btree_level = c->root->level;
+        bkey_copy(&w->data->btree_root, &c->root->key);
+        bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
+        for_each_cache(ca, c, i)
+                w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
+        w->data->magic          = jset_magic(c);
+        w->data->version        = BCACHE_JSET_VERSION;
+        w->data->last_seq       = last_seq(&c->journal);
+        w->data->csum           = csum_set(w->data);
+        for (i = 0; i < KEY_PTRS(k); i++) {
+                ca = PTR_CACHE(c, k, i);
+                bio = &ca->journal.bio;
+                atomic_long_add(sectors, &ca->meta_sectors_written);
+                bio_reset(bio);
+                bio->bi_sector  = PTR_OFFSET(k, i);
+                bio->bi_bdev    = ca->bdev;
+                bio->bi_rw      = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
+                bio->bi_size    = sectors << 9;
+                bio->bi_end_io  = journal_write_endio;
+                bio->bi_private = w;
+                bio_map(bio, w->data);
+                trace_bcache_journal_write(bio);
+                bio_list_add(&list, bio);
+                SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
+                ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
+        }
+        atomic_dec_bug(&fifo_back(&c->journal.pin));
+        bch_journal_next(&c->journal);
+        journal_reclaim(c);
+        spin_unlock(&c->journal.lock);
+        while ((bio = bio_list_pop(&list)))
+                closure_bio_submit(bio, cl, c->cache[0]);
+        continue_at(cl, journal_write_done, NULL);
+}
+static void journal_write(struct closure *cl)
+{
+        struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+        spin_lock(&c->journal.lock);
+        journal_write_unlocked(cl);
+}
+static void __journal_try_write(struct cache_set *c, bool noflush)
+{
+        struct closure *cl = &c->journal.io.cl;
+        if (!closure_trylock(cl, &c->cl))
+                spin_unlock(&c->journal.lock);
+        else if (noflush && journal_full(&c->journal)) {
+                spin_unlock(&c->journal.lock);
+                continue_at(cl, journal_write, system_wq);
+        } else
+                journal_write_unlocked(cl);
+}
+#define journal_try_write(c)    __journal_try_write(c, false)
+void bch_journal_meta(struct cache_set *c, struct closure *cl)
+{
+        struct journal_write *w;
+        if (CACHE_SYNC(&c->sb)) {
+                spin_lock(&c->journal.lock);
+                w = c->journal.cur;
+                w->need_write = true;
+                if (cl)
+                        BUG_ON(!closure_wait(&w->wait, cl));
+                __journal_try_write(c, true);
+        }
+}
+/*
+ * Entry point to the journalling code - bio_insert() and btree_invalidate()
+ * pass bch_journal() a list of keys to be journalled, and then
+ * bch_journal() hands those same keys off to btree_insert_async()
+ */
+void bch_journal(struct closure *cl)
+{
+        struct btree_op *op = container_of(cl, struct btree_op, cl);
+        struct cache_set *c = op->c;
+        struct journal_write *w;
+        size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
+        if (op->type != BTREE_INSERT ||
+            !CACHE_SYNC(&c->sb))
+                goto out;
+        /*
+         * If we're looping because we errored, might already be waiting on
+         * another journal write:
+         */
+        while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
+                closure_sync(cl->parent);
+        spin_lock(&c->journal.lock);
+        if (journal_full(&c->journal)) {
+                /* XXX: tracepoint */
+                closure_wait(&c->journal.wait, cl);
+                journal_reclaim(c);
+                spin_unlock(&c->journal.lock);
+                btree_flush_write(c);
+                continue_at(cl, bch_journal, bcache_wq);
+        }
+        w = c->journal.cur;
+        w->need_write = true;
+        b = __set_blocks(w->data, w->data->keys + n, c);
+        if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
+            b > c->journal.blocks_free) {
+                /* XXX: If we were inserting so many keys that they won't fit in
+                 * an _empty_ journal write, we'll deadlock. For now, handle
+                 * this in bch_keylist_realloc() - but something to think about.
+                 */
+                BUG_ON(!w->data->keys);
+                /* XXX: tracepoint */
+                BUG_ON(!closure_wait(&w->wait, cl));
+                closure_flush(&c->journal.io);
+                journal_try_write(c);
+                continue_at(cl, bch_journal, bcache_wq);
+        }
+        memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
+        w->data->keys += n;
+        op->journal = &fifo_back(&c->journal.pin);
+        atomic_inc(op->journal);
+        if (op->flush_journal) {
+                closure_flush(&c->journal.io);
+                closure_wait(&w->wait, cl->parent);
+        }
+        journal_try_write(c);
+out:
+        bch_btree_insert_async(cl);
+}
+void bch_journal_free(struct cache_set *c)
+{
+        free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
+        free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
+        free_fifo(&c->journal.pin);
+}
+int bch_journal_alloc(struct cache_set *c)
+{
+        struct journal *j = &c->journal;
+        closure_init_unlocked(&j->io);
+        spin_lock_init(&j->lock);
+        c->journal_delay_ms = 100;
+        j->w[0].c = c;
+        j->w[1].c = c;
+        if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+            !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
+            !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
+                return -ENOMEM;
+        return 0;
+}
author	Kent Overstreet <koverstreet@google.com>	2013-03-23 19:11:31 -0400
committer	Kent Overstreet <koverstreet@google.com>	2013-03-23 19:11:31 -0400
commit	cafe563591446cf80bfbc2fe3bc72a2e36cf1060 (patch)
tree	c8ae27b13dcdb0219634376ca5e667df32b1173a /drivers/md/bcache/journal.c
parent	ea6749c705d9e629ed03c7336cc929fc6014b834 (diff)

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c new file mode 100644 index 000000000000..c871ffaabbb0 --- /dev/null +++ b/drivers/md/bcache/journal.c
@@ -0,0 +1,785 @@
	1	/*
	2	* bcache journalling code, for btree insertions
	3	*
	4	* Copyright 2012 Google, Inc.
	5	*/
	6
	7	#include "bcache.h"
	8	#include "btree.h"
	9	#include "debug.h"
	10	#include "request.h"
	11
	12	/*
	13	* Journal replay/recovery:
	14	*
	15	* This code is all driven from run_cache_set(); we first read the journal
	16	* entries, do some other stuff, then we mark all the keys in the journal
	17	* entries (same as garbage collection would), then we replay them - reinserting
	18	* them into the cache in precisely the same order as they appear in the
	19	* journal.
	20	*
	21	* We only journal keys that go in leaf nodes, which simplifies things quite a
	22	* bit.
	23	*/
	24
	25	static void journal_read_endio(struct bio *bio, int error)
	26	{
	27	struct closure *cl = bio->bi_private;
	28	closure_put(cl);
	29	}
	30
	31	static int journal_read_bucket(struct cache ca, struct list_head list,
	32	struct btree_op *op, unsigned bucket_index)
	33	{
	34	struct journal_device *ja = &ca->journal;
	35	struct bio *bio = &ja->bio;
	36
	37	struct journal_replay *i;
	38	struct jset j, data = ca->set->journal.w[0].data;
	39	unsigned len, left, offset = 0;
	40	int ret = 0;
	41	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
	42
	43	pr_debug("reading %llu", (uint64_t) bucket);
	44
	45	while (offset < ca->sb.bucket_size) {
	46	reread: left = ca->sb.bucket_size - offset;
	47	len = min_t(unsigned, left, PAGE_SECTORS * 8);
	48
	49	bio_reset(bio);
	50	bio->bi_sector = bucket + offset;
	51	bio->bi_bdev = ca->bdev;
	52	bio->bi_rw = READ;
	53	bio->bi_size = len << 9;
	54
	55	bio->bi_end_io = journal_read_endio;
	56	bio->bi_private = &op->cl;
	57	bio_map(bio, data);
	58
	59	closure_bio_submit(bio, &op->cl, ca);
	60	closure_sync(&op->cl);
	61
	62	/* This function could be simpler now since we no longer write
	63	* journal entries that overlap bucket boundaries; this means
	64	* the start of a bucket will always have a valid journal entry
	65	* if it has any journal entries at all.
	66	*/
	67
	68	j = data;
	69	while (len) {
	70	struct list_head *where;
	71	size_t blocks, bytes = set_bytes(j);
	72
	73	if (j->magic != jset_magic(ca->set))
	74	return ret;
	75
	76	if (bytes > left << 9)
	77	return ret;
	78
	79	if (bytes > len << 9)
	80	goto reread;
	81
	82	if (j->csum != csum_set(j))
	83	return ret;
	84
	85	blocks = set_blocks(j, ca->set);
	86
	87	while (!list_empty(list)) {
	88	i = list_first_entry(list,
	89	struct journal_replay, list);
	90	if (i->j.seq >= j->last_seq)
	91	break;
	92	list_del(&i->list);
	93	kfree(i);
	94	}
	95
	96	list_for_each_entry_reverse(i, list, list) {
	97	if (j->seq == i->j.seq)
	98	goto next_set;
	99
	100	if (j->seq < i->j.last_seq)
	101	goto next_set;
	102
	103	if (j->seq > i->j.seq) {
	104	where = &i->list;
	105	goto add;
	106	}
	107	}
	108
	109	where = list;
	110	add:
	111	i = kmalloc(offsetof(struct journal_replay, j) +
	112	bytes, GFP_KERNEL);
	113	if (!i)
	114	return -ENOMEM;
	115	memcpy(&i->j, j, bytes);
	116	list_add(&i->list, where);
	117	ret = 1;
	118
	119	ja->seq[bucket_index] = j->seq;
	120	next_set:
	121	offset += blocks * ca->sb.block_size;
	122	len -= blocks * ca->sb.block_size;
	123	j = ((void ) j) + blocks block_bytes(ca);
	124	}
	125	}
	126
	127	return ret;
	128	}
	129
	130	int bch_journal_read(struct cache_set c, struct list_head list,
	131	struct btree_op *op)
	132	{
	133	#define read_bucket(b) \
	134	({ \
	135	int ret = journal_read_bucket(ca, list, op, b); \
	136	__set_bit(b, bitmap); \
	137	if (ret < 0) \
	138	return ret; \
	139	ret; \
	140	})
	141
	142	struct cache *ca;
	143	unsigned iter;
	144
	145	for_each_cache(ca, c, iter) {
	146	struct journal_device *ja = &ca->journal;
	147	unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
	148	unsigned i, l, r, m;
	149	uint64_t seq;
	150
	151	bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
	152	pr_debug("%u journal buckets", ca->sb.njournal_buckets);
	153
	154	/* Read journal buckets ordered by golden ratio hash to quickly
	155	* find a sequence of buckets with valid journal entries
	156	*/
	157	for (i = 0; i < ca->sb.njournal_buckets; i++) {
	158	l = (i * 2654435769U) % ca->sb.njournal_buckets;
	159
	160	if (test_bit(l, bitmap))
	161	break;
	162
	163	if (read_bucket(l))
	164	goto bsearch;
	165	}
	166
	167	/* If that fails, check all the buckets we haven't checked
	168	* already
	169	*/
	170	pr_debug("falling back to linear search");
	171
	172	for (l = 0; l < ca->sb.njournal_buckets; l++) {
	173	if (test_bit(l, bitmap))
	174	continue;
	175
	176	if (read_bucket(l))
	177	goto bsearch;
	178	}
	179	bsearch:
	180	/* Binary search */
	181	m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
	182	pr_debug("starting binary search, l %u r %u", l, r);
	183
	184	while (l + 1 < r) {
	185	m = (l + r) >> 1;
	186
	187	if (read_bucket(m))
	188	l = m;
	189	else
	190	r = m;
	191	}
	192
	193	/* Read buckets in reverse order until we stop finding more
	194	* journal entries
	195	*/
	196	pr_debug("finishing up");
	197	l = m;
	198
	199	while (1) {
	200	if (!l--)
	201	l = ca->sb.njournal_buckets - 1;
	202
	203	if (l == m)
	204	break;
	205
	206	if (test_bit(l, bitmap))
	207	continue;
	208
	209	if (!read_bucket(l))
	210	break;
	211	}
	212
	213	seq = 0;
	214
	215	for (i = 0; i < ca->sb.njournal_buckets; i++)
	216	if (ja->seq[i] > seq) {
	217	seq = ja->seq[i];
	218	ja->cur_idx = ja->discard_idx =
	219	ja->last_idx = i;
	220
	221	}
	222	}
	223
	224	c->journal.seq = list_entry(list->prev,
	225	struct journal_replay,
	226	list)->j.seq;
	227
	228	return 0;
	229	#undef read_bucket
	230	}
	231
	232	void bch_journal_mark(struct cache_set c, struct list_head list)
	233	{
	234	atomic_t p = { 0 };
	235	struct bkey *k;
	236	struct journal_replay *i;
	237	struct journal *j = &c->journal;
	238	uint64_t last = j->seq;
	239
	240	/*
	241	* journal.pin should never fill up - we never write a journal
	242	* entry when it would fill up. But if for some reason it does, we
	243	* iterate over the list in reverse order so that we can just skip that
	244	* refcount instead of bugging.
	245	*/
	246
	247	list_for_each_entry_reverse(i, list, list) {
	248	BUG_ON(last < i->j.seq);
	249	i->pin = NULL;
	250
	251	while (last-- != i->j.seq)
	252	if (fifo_free(&j->pin) > 1) {
	253	fifo_push_front(&j->pin, p);
	254	atomic_set(&fifo_front(&j->pin), 0);
	255	}
	256
	257	if (fifo_free(&j->pin) > 1) {
	258	fifo_push_front(&j->pin, p);
	259	i->pin = &fifo_front(&j->pin);
	260	atomic_set(i->pin, 1);
	261	}
	262
	263	for (k = i->j.start;
	264	k < end(&i->j);
	265	k = bkey_next(k)) {
	266	unsigned j;
	267
	268	for (j = 0; j < KEY_PTRS(k); j++) {
	269	struct bucket *g = PTR_BUCKET(c, k, j);
	270	atomic_inc(&g->pin);
	271
	272	if (g->prio == BTREE_PRIO &&
	273	!ptr_stale(c, k, j))
	274	g->prio = INITIAL_PRIO;
	275	}
	276
	277	__bch_btree_mark_key(c, 0, k);
	278	}
	279	}
	280	}
	281
	282	int bch_journal_replay(struct cache_set s, struct list_head list,
	283	struct btree_op *op)
	284	{
	285	int ret = 0, keys = 0, entries = 0;
	286	struct bkey *k;
	287	struct journal_replay *i =
	288	list_entry(list->prev, struct journal_replay, list);
	289
	290	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
	291
	292	list_for_each_entry(i, list, list) {
	293	BUG_ON(i->pin && atomic_read(i->pin) != 1);
	294
	295	if (n != i->j.seq)
	296	pr_err("journal entries %llu-%llu "
	297	"missing! (replaying %llu-%llu)\n",
	298	n, i->j.seq - 1, start, end);
	299
	300	for (k = i->j.start;
	301	k < end(&i->j);
	302	k = bkey_next(k)) {
	303	pr_debug("%s", pkey(k));
	304	bkey_copy(op->keys.top, k);
	305	bch_keylist_push(&op->keys);
	306
	307	op->journal = i->pin;
	308	atomic_inc(op->journal);
	309
	310	ret = bch_btree_insert(op, s);
	311	if (ret)
	312	goto err;
	313
	314	BUG_ON(!bch_keylist_empty(&op->keys));
	315	keys++;
	316
	317	cond_resched();
	318	}
	319
	320	if (i->pin)
	321	atomic_dec(i->pin);
	322	n = i->j.seq + 1;
	323	entries++;
	324	}
	325
	326	pr_info("journal replay done, %i keys in %i entries, seq %llu",
	327	keys, entries, end);
	328
	329	while (!list_empty(list)) {
	330	i = list_first_entry(list, struct journal_replay, list);
	331	list_del(&i->list);
	332	kfree(i);
	333	}
	334	err:
	335	closure_sync(&op->cl);
	336	return ret;
	337	}
	338
	339	/* Journalling */
	340
	341	static void btree_flush_write(struct cache_set *c)
	342	{
	343	/*
	344	* Try to find the btree node with that references the oldest journal
	345	* entry, best is our current candidate and is locked if non NULL:
	346	*/
	347	struct btree b, best = NULL;
	348	unsigned iter;
	349
	350	for_each_cached_btree(b, c, iter) {
	351	if (!down_write_trylock(&b->lock))
	352	continue;
	353
	354	if (!btree_node_dirty(b) \|\|
	355	!btree_current_write(b)->journal) {
	356	rw_unlock(true, b);
	357	continue;
	358	}
	359
	360	if (!best)
	361	best = b;
	362	else if (journal_pin_cmp(c,
	363	btree_current_write(best),
	364	btree_current_write(b))) {
	365	rw_unlock(true, best);
	366	best = b;
	367	} else
	368	rw_unlock(true, b);
	369	}
	370
	371	if (best)
	372	goto out;
	373
	374	/* We can't find the best btree node, just pick the first */
	375	list_for_each_entry(b, &c->btree_cache, list)
	376	if (!b->level && btree_node_dirty(b)) {
	377	best = b;
	378	rw_lock(true, best, best->level);
	379	goto found;
	380	}
	381
	382	out:
	383	if (!best)
	384	return;
	385	found:
	386	if (btree_node_dirty(best))
	387	bch_btree_write(best, true, NULL);
	388	rw_unlock(true, best);
	389	}
	390
	391	#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
	392
	393	static void journal_discard_endio(struct bio *bio, int error)
	394	{
	395	struct journal_device *ja =
	396	container_of(bio, struct journal_device, discard_bio);
	397	struct cache *ca = container_of(ja, struct cache, journal);
	398
	399	atomic_set(&ja->discard_in_flight, DISCARD_DONE);
	400
	401	closure_wake_up(&ca->set->journal.wait);
	402	closure_put(&ca->set->cl);
	403	}
	404
	405	static void journal_discard_work(struct work_struct *work)
	406	{
	407	struct journal_device *ja =
	408	container_of(work, struct journal_device, discard_work);
	409
	410	submit_bio(0, &ja->discard_bio);
	411	}
	412
	413	static void do_journal_discard(struct cache *ca)
	414	{
	415	struct journal_device *ja = &ca->journal;
	416	struct bio *bio = &ja->discard_bio;
	417
	418	if (!ca->discard) {
	419	ja->discard_idx = ja->last_idx;
	420	return;
	421	}
	422
	423	switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) {
	424	case DISCARD_IN_FLIGHT:
	425	return;
	426
	427	case DISCARD_DONE:
	428	ja->discard_idx = (ja->discard_idx + 1) %
	429	ca->sb.njournal_buckets;
	430
	431	atomic_set(&ja->discard_in_flight, DISCARD_READY);
	432	/* fallthrough */
	433
	434	case DISCARD_READY:
	435	if (ja->discard_idx == ja->last_idx)
	436	return;
	437
	438	atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
	439
	440	bio_init(bio);
	441	bio->bi_sector = bucket_to_sector(ca->set,
	442	ca->sb.d[ja->discard_idx]);
	443	bio->bi_bdev = ca->bdev;
	444	bio->bi_rw = REQ_WRITE\|REQ_DISCARD;
	445	bio->bi_max_vecs = 1;
	446	bio->bi_io_vec = bio->bi_inline_vecs;
	447	bio->bi_size = bucket_bytes(ca);
	448	bio->bi_end_io = journal_discard_endio;
	449
	450	closure_get(&ca->set->cl);
	451	INIT_WORK(&ja->discard_work, journal_discard_work);
	452	schedule_work(&ja->discard_work);
	453	}
	454	}
	455
	456	static void journal_reclaim(struct cache_set *c)
	457	{
	458	struct bkey *k = &c->journal.key;
	459	struct cache *ca;
	460	uint64_t last_seq;
	461	unsigned iter, n = 0;
	462	atomic_t p;
	463
	464	while (!atomic_read(&fifo_front(&c->journal.pin)))
	465	fifo_pop(&c->journal.pin, p);
	466
	467	last_seq = last_seq(&c->journal);
	468
	469	/* Update last_idx */
	470
	471	for_each_cache(ca, c, iter) {
	472	struct journal_device *ja = &ca->journal;
	473
	474	while (ja->last_idx != ja->cur_idx &&
	475	ja->seq[ja->last_idx] < last_seq)
	476	ja->last_idx = (ja->last_idx + 1) %
	477	ca->sb.njournal_buckets;
	478	}
	479
	480	for_each_cache(ca, c, iter)
	481	do_journal_discard(ca);
	482
	483	if (c->journal.blocks_free)
	484	return;
	485
	486	/*
	487	* Allocate:
	488	* XXX: Sort by free journal space
	489	*/
	490
	491	for_each_cache(ca, c, iter) {
	492	struct journal_device *ja = &ca->journal;
	493	unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets;
	494
	495	/* No space available on this device */
	496	if (next == ja->discard_idx)
	497	continue;
	498
	499	ja->cur_idx = next;
	500	k->ptr[n++] = PTR(0,
	501	bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
	502	ca->sb.nr_this_dev);
	503	}
	504
	505	bkey_init(k);
	506	SET_KEY_PTRS(k, n);
	507
	508	if (n)
	509	c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
	510
	511	if (!journal_full(&c->journal))
	512	__closure_wake_up(&c->journal.wait);
	513	}
	514
	515	void bch_journal_next(struct journal *j)
	516	{
	517	atomic_t p = { 1 };
	518
	519	j->cur = (j->cur == j->w)
	520	? &j->w[1]
	521	: &j->w[0];
	522
	523	/*
	524	* The fifo_push() needs to happen at the same time as j->seq is
	525	* incremented for last_seq() to be calculated correctly
	526	*/
	527	BUG_ON(!fifo_push(&j->pin, p));
	528	atomic_set(&fifo_back(&j->pin), 1);
	529
	530	j->cur->data->seq = ++j->seq;
	531	j->cur->need_write = false;
	532	j->cur->data->keys = 0;
	533
	534	if (fifo_full(&j->pin))
	535	pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
	536	}
	537
	538	static void journal_write_endio(struct bio *bio, int error)
	539	{
	540	struct journal_write *w = bio->bi_private;
	541
	542	cache_set_err_on(error, w->c, "journal io error");
	543	closure_put(&w->c->journal.io.cl);
	544	}
	545
	546	static void journal_write(struct closure *);
	547
	548	static void journal_write_done(struct closure *cl)
	549	{
	550	struct journal *j = container_of(cl, struct journal, io.cl);
	551	struct cache_set *c = container_of(j, struct cache_set, journal);
	552
	553	struct journal_write *w = (j->cur == j->w)
	554	? &j->w[1]
	555	: &j->w[0];
	556
	557	__closure_wake_up(&w->wait);
	558
	559	if (c->journal_delay_ms)
	560	closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
	561
	562	continue_at(cl, journal_write, system_wq);
	563	}
	564
	565	static void journal_write_unlocked(struct closure *cl)
	566	{
	567	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
	568	struct cache *ca;
	569	struct journal_write *w = c->journal.cur;
	570	struct bkey *k = &c->journal.key;
	571	unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
	572
	573	struct bio *bio;
	574	struct bio_list list;
	575	bio_list_init(&list);
	576
	577	if (!w->need_write) {
	578	/*
	579	* XXX: have to unlock closure before we unlock journal lock,
	580	* else we race with bch_journal(). But this way we race
	581	* against cache set unregister. Doh.
	582	*/
	583	set_closure_fn(cl, NULL, NULL);
	584	closure_sub(cl, CLOSURE_RUNNING + 1);
	585	spin_unlock(&c->journal.lock);
	586	return;
	587	} else if (journal_full(&c->journal)) {
	588	journal_reclaim(c);
	589	spin_unlock(&c->journal.lock);
	590
	591	btree_flush_write(c);
	592	continue_at(cl, journal_write, system_wq);
	593	}
	594
	595	c->journal.blocks_free -= set_blocks(w->data, c);
	596
	597	w->data->btree_level = c->root->level;
	598
	599	bkey_copy(&w->data->btree_root, &c->root->key);
	600	bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket);
	601
	602	for_each_cache(ca, c, i)
	603	w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
	604
	605	w->data->magic = jset_magic(c);
	606	w->data->version = BCACHE_JSET_VERSION;
	607	w->data->last_seq = last_seq(&c->journal);
	608	w->data->csum = csum_set(w->data);
	609
	610	for (i = 0; i < KEY_PTRS(k); i++) {
	611	ca = PTR_CACHE(c, k, i);
	612	bio = &ca->journal.bio;
	613
	614	atomic_long_add(sectors, &ca->meta_sectors_written);
	615
	616	bio_reset(bio);
	617	bio->bi_sector = PTR_OFFSET(k, i);
	618	bio->bi_bdev = ca->bdev;
	619	bio->bi_rw = REQ_WRITE\|REQ_SYNC\|REQ_META\|REQ_FLUSH;
	620	bio->bi_size = sectors << 9;
	621
	622	bio->bi_end_io = journal_write_endio;
	623	bio->bi_private = w;
	624	bio_map(bio, w->data);
	625
	626	trace_bcache_journal_write(bio);
	627	bio_list_add(&list, bio);
	628
	629	SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
	630
	631	ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
	632	}
	633
	634	atomic_dec_bug(&fifo_back(&c->journal.pin));
	635	bch_journal_next(&c->journal);
	636	journal_reclaim(c);
	637
	638	spin_unlock(&c->journal.lock);
	639
	640	while ((bio = bio_list_pop(&list)))
	641	closure_bio_submit(bio, cl, c->cache[0]);
	642
	643	continue_at(cl, journal_write_done, NULL);
	644	}
	645
	646	static void journal_write(struct closure *cl)
	647	{
	648	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
	649
	650	spin_lock(&c->journal.lock);
	651	journal_write_unlocked(cl);
	652	}
	653
	654	static void __journal_try_write(struct cache_set *c, bool noflush)
	655	{
	656	struct closure *cl = &c->journal.io.cl;
	657
	658	if (!closure_trylock(cl, &c->cl))
	659	spin_unlock(&c->journal.lock);
	660	else if (noflush && journal_full(&c->journal)) {
	661	spin_unlock(&c->journal.lock);
	662	continue_at(cl, journal_write, system_wq);
	663	} else
	664	journal_write_unlocked(cl);
	665	}
	666
	667	#define journal_try_write(c) __journal_try_write(c, false)
	668
	669	void bch_journal_meta(struct cache_set c, struct closure cl)
	670	{
	671	struct journal_write *w;
	672
	673	if (CACHE_SYNC(&c->sb)) {
	674	spin_lock(&c->journal.lock);
	675
	676	w = c->journal.cur;
	677	w->need_write = true;
	678
	679	if (cl)
	680	BUG_ON(!closure_wait(&w->wait, cl));
	681
	682	__journal_try_write(c, true);
	683	}
	684	}
	685
	686	/*
	687	* Entry point to the journalling code - bio_insert() and btree_invalidate()
	688	* pass bch_journal() a list of keys to be journalled, and then
	689	* bch_journal() hands those same keys off to btree_insert_async()
	690	*/
	691
	692	void bch_journal(struct closure *cl)
	693	{
	694	struct btree_op *op = container_of(cl, struct btree_op, cl);
	695	struct cache_set *c = op->c;
	696	struct journal_write *w;
	697	size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
	698
	699	if (op->type != BTREE_INSERT \|\|
	700	!CACHE_SYNC(&c->sb))
	701	goto out;
	702
	703	/*
	704	* If we're looping because we errored, might already be waiting on
	705	* another journal write:
	706	*/
	707	while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
	708	closure_sync(cl->parent);
	709
	710	spin_lock(&c->journal.lock);
	711
	712	if (journal_full(&c->journal)) {
	713	/* XXX: tracepoint */
	714	closure_wait(&c->journal.wait, cl);
	715
	716	journal_reclaim(c);
	717	spin_unlock(&c->journal.lock);
	718
	719	btree_flush_write(c);
	720	continue_at(cl, bch_journal, bcache_wq);
	721	}
	722
	723	w = c->journal.cur;
	724	w->need_write = true;
	725	b = __set_blocks(w->data, w->data->keys + n, c);
	726
	727	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS \|\|
	728	b > c->journal.blocks_free) {
	729	/* XXX: If we were inserting so many keys that they won't fit in
	730	* an _empty_ journal write, we'll deadlock. For now, handle
	731	* this in bch_keylist_realloc() - but something to think about.
	732	*/
	733	BUG_ON(!w->data->keys);
	734
	735	/* XXX: tracepoint */
	736	BUG_ON(!closure_wait(&w->wait, cl));
	737
	738	closure_flush(&c->journal.io);
	739
	740	journal_try_write(c);
	741	continue_at(cl, bch_journal, bcache_wq);
	742	}
	743
	744	memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
	745	w->data->keys += n;
	746
	747	op->journal = &fifo_back(&c->journal.pin);
	748	atomic_inc(op->journal);
	749
	750	if (op->flush_journal) {
	751	closure_flush(&c->journal.io);
	752	closure_wait(&w->wait, cl->parent);
	753	}
	754
	755	journal_try_write(c);
	756	out:
	757	bch_btree_insert_async(cl);
	758	}
	759
	760	void bch_journal_free(struct cache_set *c)
	761	{
	762	free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
	763	free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
	764	free_fifo(&c->journal.pin);
	765	}
	766
	767	int bch_journal_alloc(struct cache_set *c)
	768	{
	769	struct journal *j = &c->journal;
	770
	771	closure_init_unlocked(&j->io);
	772	spin_lock_init(&j->lock);
	773
	774	c->journal_delay_ms = 100;
	775
	776	j->w[0].c = c;
	777	j->w[1].c = c;
	778
	779	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) \|\|
	780	!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) \|\|
	781	!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
	782	return -ENOMEM;
	783
	784	return 0;
	785	}