aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Kent Overstreet <kmo@daterainc.com> 2013-10-31 18:46:42 -0400
committer: Kent Overstreet <kmo@daterainc.com> 2013-11-11 00:56:33 -0500
commit: 81ab4190ac17df41686a37c97f701623276b652a (patch)
tree: be7f48b5ad6d36fadbac4658d37953d324d760d0
parent: 2599b53b7b0ea6103d1661dca74d35480cb8fa1f (diff)
bcache: Pull on disk data structures out into a separate header
Now, the on disk data structures are in a header that can be exported to userspace - and having them all centralized is nice too.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
-rw-r--r-- drivers/md/bcache/bcache.h 244
-rw-r--r-- drivers/md/bcache/bset.c 4
-rw-r--r-- drivers/md/bcache/bset.h 31
-rw-r--r-- drivers/md/bcache/btree.c 2
-rw-r--r-- drivers/md/bcache/journal.c 4
-rw-r--r-- drivers/md/bcache/journal.h 37
-rw-r--r-- drivers/md/bcache/request.c 9
-rw-r--r-- drivers/md/bcache/super.c 13
-rw-r--r-- drivers/md/bcache/util.h 10
-rw-r--r-- include/uapi/linux/bcache.h 373
10 files changed, 387 insertions, 340 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index e32f6fd91755..045cb99f1ca6 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -177,6 +177,7 @@
177 177
178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ 178#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
179 179
180#include <linux/bcache.h>
180#include <linux/bio.h> 181#include <linux/bio.h>
181#include <linux/kobject.h> 182#include <linux/kobject.h>
182#include <linux/list.h> 183#include <linux/list.h>
@@ -210,168 +211,6 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
210#define GC_MARK_METADATA 2 211#define GC_MARK_METADATA 2
211BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); 212BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
212 213
213struct bkey {
214 uint64_t high;
215 uint64_t low;
216 uint64_t ptr[];
217};
218
219/* Enough for a key with 6 pointers */
220#define BKEY_PAD 8
221
222#define BKEY_PADDED(key) \
223 union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; }
224
225/* Version 0: Cache device
226 * Version 1: Backing device
227 * Version 2: Seed pointer into btree node checksum
228 * Version 3: Cache device with new UUID format
229 * Version 4: Backing device with data offset
230 */
231#define BCACHE_SB_VERSION_CDEV 0
232#define BCACHE_SB_VERSION_BDEV 1
233#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
234#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
235#define BCACHE_SB_MAX_VERSION 4
236
237#define SB_SECTOR 8
238#define SB_SIZE 4096
239#define SB_LABEL_SIZE 32
240#define SB_JOURNAL_BUCKETS 256U
241/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
242#define MAX_CACHES_PER_SET 8
243
244#define BDEV_DATA_START_DEFAULT 16 /* sectors */
245
246struct cache_sb {
247 uint64_t csum;
248 uint64_t offset; /* sector where this sb was written */
249 uint64_t version;
250
251 uint8_t magic[16];
252
253 uint8_t uuid[16];
254 union {
255 uint8_t set_uuid[16];
256 uint64_t set_magic;
257 };
258 uint8_t label[SB_LABEL_SIZE];
259
260 uint64_t flags;
261 uint64_t seq;
262 uint64_t pad[8];
263
264 union {
265 struct {
266 /* Cache devices */
267 uint64_t nbuckets; /* device size */
268
269 uint16_t block_size; /* sectors */
270 uint16_t bucket_size; /* sectors */
271
272 uint16_t nr_in_set;
273 uint16_t nr_this_dev;
274 };
275 struct {
276 /* Backing devices */
277 uint64_t data_offset;
278
279 /*
280 * block_size from the cache device section is still used by
281 * backing devices, so don't add anything here until we fix
282 * things to not need it for backing devices anymore
283 */
284 };
285 };
286
287 uint32_t last_mount; /* time_t */
288
289 uint16_t first_bucket;
290 union {
291 uint16_t njournal_buckets;
292 uint16_t keys;
293 };
294 uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */
295};
296
297BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
298BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
299BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
300#define CACHE_REPLACEMENT_LRU 0U
301#define CACHE_REPLACEMENT_FIFO 1U
302#define CACHE_REPLACEMENT_RANDOM 2U
303
304BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
305#define CACHE_MODE_WRITETHROUGH 0U
306#define CACHE_MODE_WRITEBACK 1U
307#define CACHE_MODE_WRITEAROUND 2U
308#define CACHE_MODE_NONE 3U
309BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
310#define BDEV_STATE_NONE 0U
311#define BDEV_STATE_CLEAN 1U
312#define BDEV_STATE_DIRTY 2U
313#define BDEV_STATE_STALE 3U
314
315/* Version 1: Seed pointer into btree node checksum
316 */
317#define BCACHE_BSET_VERSION 1
318
319/*
320 * This is the on disk format for btree nodes - a btree node on disk is a list
321 * of these; within each set the keys are sorted
322 */
323struct bset {
324 uint64_t csum;
325 uint64_t magic;
326 uint64_t seq;
327 uint32_t version;
328 uint32_t keys;
329
330 union {
331 struct bkey start[0];
332 uint64_t d[0];
333 };
334};
335
336/*
337 * On disk format for priorities and gens - see super.c near prio_write() for
338 * more.
339 */
340struct prio_set {
341 uint64_t csum;
342 uint64_t magic;
343 uint64_t seq;
344 uint32_t version;
345 uint32_t pad;
346
347 uint64_t next_bucket;
348
349 struct bucket_disk {
350 uint16_t prio;
351 uint8_t gen;
352 } __attribute((packed)) data[];
353};
354
355struct uuid_entry {
356 union {
357 struct {
358 uint8_t uuid[16];
359 uint8_t label[32];
360 uint32_t first_reg;
361 uint32_t last_reg;
362 uint32_t invalidated;
363
364 uint32_t flags;
365 /* Size of flash only volumes */
366 uint64_t sectors;
367 };
368
369 uint8_t pad[128];
370 };
371};
372
373BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
374
375#include "journal.h" 214#include "journal.h"
376#include "stats.h" 215#include "stats.h"
377struct search; 216struct search;
@@ -868,12 +707,6 @@ static inline bool key_merging_disabled(struct cache_set *c)
868#endif 707#endif
869} 708}
870 709
871static inline bool SB_IS_BDEV(const struct cache_sb *sb)
872{
873 return sb->version == BCACHE_SB_VERSION_BDEV
874 || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
875}
876
877struct bbio { 710struct bbio {
878 unsigned submit_time_us; 711 unsigned submit_time_us;
879 union { 712 union {
@@ -927,59 +760,6 @@ static inline unsigned local_clock_us(void)
927#define prio_buckets(c) \ 760#define prio_buckets(c) \
928 DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) 761 DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
929 762
930#define JSET_MAGIC 0x245235c1a3625032ULL
931#define PSET_MAGIC 0x6750e15f87337f91ULL
932#define BSET_MAGIC 0x90135c78b99e07f5ULL
933
934#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC)
935#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC)
936#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC)
937
938/* Bkey fields: all units are in sectors */
939
940#define KEY_FIELD(name, field, offset, size) \
941 BITMASK(name, struct bkey, field, offset, size)
942
943#define PTR_FIELD(name, offset, size) \
944 static inline uint64_t name(const struct bkey *k, unsigned i) \
945 { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \
946 \
947 static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
948 { \
949 k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \
950 k->ptr[i] |= v << offset; \
951 }
952
953KEY_FIELD(KEY_PTRS, high, 60, 3)
954KEY_FIELD(HEADER_SIZE, high, 58, 2)
955KEY_FIELD(KEY_CSUM, high, 56, 2)
956KEY_FIELD(KEY_PINNED, high, 55, 1)
957KEY_FIELD(KEY_DIRTY, high, 36, 1)
958
959KEY_FIELD(KEY_SIZE, high, 20, 16)
960KEY_FIELD(KEY_INODE, high, 0, 20)
961
962/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
963
964static inline uint64_t KEY_OFFSET(const struct bkey *k)
965{
966 return k->low;
967}
968
969static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v)
970{
971 k->low = v;
972}
973
974PTR_FIELD(PTR_DEV, 51, 12)
975PTR_FIELD(PTR_OFFSET, 8, 43)
976PTR_FIELD(PTR_GEN, 0, 8)
977
978#define PTR_CHECK_DEV ((1 << 12) - 1)
979
980#define PTR(gen, offset, dev) \
981 ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen)
982
983static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) 763static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
984{ 764{
985 return s >> c->bucket_bits; 765 return s >> c->bucket_bits;
@@ -1018,31 +798,11 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
1018 798
1019/* Btree key macros */ 799/* Btree key macros */
1020 800
1021/*
1022 * The high bit being set is a relic from when we used it to do binary
1023 * searches - it told you where a key started. It's not used anymore,
1024 * and can probably be safely dropped.
1025 */
1026#define KEY(dev, sector, len) \
1027((struct bkey) { \
1028 .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \
1029 .low = (sector) \
1030})
1031
1032static inline void bkey_init(struct bkey *k) 801static inline void bkey_init(struct bkey *k)
1033{ 802{
1034 *k = KEY(0, 0, 0); 803 *k = ZERO_KEY;
1035} 804}
1036 805
1037#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k))
1038#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0)
1039
1040#define MAX_KEY_INODE (~(~0 << 20))
1041#define MAX_KEY_OFFSET (((uint64_t) ~0) >> 1)
1042#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0)
1043
1044#define ZERO_KEY KEY(0, 0, 0)
1045
1046/* 806/*
1047 * This is used for various on disk data structures - cache_sb, prio_set, bset, 807 * This is used for various on disk data structures - cache_sb, prio_set, bset,
1048 * jset: The checksum is _always_ the first 8 bytes of these structs 808 * jset: The checksum is _always_ the first 8 bytes of these structs
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index f7b5525ddafa..7b8713c66050 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -684,7 +684,7 @@ void bch_bset_init_next(struct btree *b)
684 } else 684 } else
685 get_random_bytes(&i->seq, sizeof(uint64_t)); 685 get_random_bytes(&i->seq, sizeof(uint64_t));
686 686
687 i->magic = bset_magic(b->c); 687 i->magic = bset_magic(&b->c->sb);
688 i->version = 0; 688 i->version = 0;
689 i->keys = 0; 689 i->keys = 0;
690 690
@@ -1034,7 +1034,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter,
1034 * memcpy() 1034 * memcpy()
1035 */ 1035 */
1036 1036
1037 out->magic = bset_magic(b->c); 1037 out->magic = bset_magic(&b->c->sb);
1038 out->seq = b->sets[0].data->seq; 1038 out->seq = b->sets[0].data->seq;
1039 out->version = b->sets[0].data->version; 1039 out->version = b->sets[0].data->version;
1040 swap(out, b->sets[0].data); 1040 swap(out, b->sets[0].data);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 8a9305685b7e..5cd90565dfe2 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -193,37 +193,6 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l,
193 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); 193 : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
194} 194}
195 195
196static inline size_t bkey_u64s(const struct bkey *k)
197{
198 BUG_ON(KEY_CSUM(k) > 1);
199 return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0);
200}
201
202static inline size_t bkey_bytes(const struct bkey *k)
203{
204 return bkey_u64s(k) * sizeof(uint64_t);
205}
206
207static inline void bkey_copy(struct bkey *dest, const struct bkey *src)
208{
209 memcpy(dest, src, bkey_bytes(src));
210}
211
212static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
213{
214 if (!src)
215 src = &KEY(0, 0, 0);
216
217 SET_KEY_INODE(dest, KEY_INODE(src));
218 SET_KEY_OFFSET(dest, KEY_OFFSET(src));
219}
220
221static inline struct bkey *bkey_next(const struct bkey *k)
222{
223 uint64_t *d = (void *) k;
224 return (struct bkey *) (d + bkey_u64s(k));
225}
226
227/* Keylists */ 196/* Keylists */
228 197
229struct keylist { 198struct keylist {
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f5aa4adadf1d..aba787d954e5 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -231,7 +231,7 @@ static void bch_btree_node_read_done(struct btree *b)
231 goto err; 231 goto err;
232 232
233 err = "bad magic"; 233 err = "bad magic";
234 if (i->magic != bset_magic(b->c)) 234 if (i->magic != bset_magic(&b->c->sb))
235 goto err; 235 goto err;
236 236
237 err = "bad checksum"; 237 err = "bad checksum";
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 86de64a6bf26..ecdaa671bd50 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -74,7 +74,7 @@ reread: left = ca->sb.bucket_size - offset;
74 struct list_head *where; 74 struct list_head *where;
75 size_t blocks, bytes = set_bytes(j); 75 size_t blocks, bytes = set_bytes(j);
76 76
77 if (j->magic != jset_magic(ca->set)) 77 if (j->magic != jset_magic(&ca->sb))
78 return ret; 78 return ret;
79 79
80 if (bytes > left << 9) 80 if (bytes > left << 9)
@@ -596,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
596 for_each_cache(ca, c, i) 596 for_each_cache(ca, c, i)
597 w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; 597 w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
598 598
599 w->data->magic = jset_magic(c); 599 w->data->magic = jset_magic(&c->sb);
600 w->data->version = BCACHE_JSET_VERSION; 600 w->data->version = BCACHE_JSET_VERSION;
601 w->data->last_seq = last_seq(&c->journal); 601 w->data->last_seq = last_seq(&c->journal);
602 w->data->csum = csum_set(w->data); 602 w->data->csum = csum_set(w->data);
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 5e9edb9ef376..a6472fda94b2 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -75,43 +75,6 @@
75 * nodes that are pinning the oldest journal entries first. 75 * nodes that are pinning the oldest journal entries first.
76 */ 76 */
77 77
78#define BCACHE_JSET_VERSION_UUIDv1 1
79/* Always latest UUID format */
80#define BCACHE_JSET_VERSION_UUID 1
81#define BCACHE_JSET_VERSION 1
82
83/*
84 * On disk format for a journal entry:
85 * seq is monotonically increasing; every journal entry has its own unique
86 * sequence number.
87 *
88 * last_seq is the oldest journal entry that still has keys the btree hasn't
89 * flushed to disk yet.
90 *
91 * version is for on disk format changes.
92 */
93struct jset {
94 uint64_t csum;
95 uint64_t magic;
96 uint64_t seq;
97 uint32_t version;
98 uint32_t keys;
99
100 uint64_t last_seq;
101
102 BKEY_PADDED(uuid_bucket);
103 BKEY_PADDED(btree_root);
104 uint16_t btree_level;
105 uint16_t pad[3];
106
107 uint64_t prio_bucket[MAX_CACHES_PER_SET];
108
109 union {
110 struct bkey start[0];
111 uint64_t d[0];
112 };
113};
114
115/* 78/*
116 * Only used for holding the journal entries we read in btree_journal_read() 79 * Only used for holding the journal entries we read in btree_journal_read()
117 * during cache_registration 80 * during cache_registration
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index cf7850a7592c..932300f18973 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -264,16 +264,17 @@ static void bch_data_invalidate(struct closure *cl)
264 bio_sectors(bio), (uint64_t) bio->bi_sector); 264 bio_sectors(bio), (uint64_t) bio->bi_sector);
265 265
266 while (bio_sectors(bio)) { 266 while (bio_sectors(bio)) {
267 unsigned len = min(bio_sectors(bio), 1U << 14); 267 unsigned sectors = min(bio_sectors(bio),
268 1U << (KEY_SIZE_BITS - 1));
268 269
269 if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) 270 if (bch_keylist_realloc(&op->insert_keys, 0, op->c))
270 goto out; 271 goto out;
271 272
272 bio->bi_sector += len; 273 bio->bi_sector += sectors;
273 bio->bi_size -= len << 9; 274 bio->bi_size -= sectors << 9;
274 275
275 bch_keylist_add(&op->insert_keys, 276 bch_keylist_add(&op->insert_keys,
276 &KEY(op->inode, bio->bi_sector, len)); 277 &KEY(op->inode, bio->bi_sector, sectors));
277 } 278 }
278 279
279 op->insert_data_done = true; 280 op->insert_data_done = true;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index a314c771263f..c67d19a8913d 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -45,15 +45,6 @@ const char * const bch_cache_modes[] = {
45 NULL 45 NULL
46}; 46};
47 47
48struct uuid_entry_v0 {
49 uint8_t uuid[16];
50 uint8_t label[32];
51 uint32_t first_reg;
52 uint32_t last_reg;
53 uint32_t invalidated;
54 uint32_t pad;
55};
56
57static struct kobject *bcache_kobj; 48static struct kobject *bcache_kobj;
58struct mutex bch_register_lock; 49struct mutex bch_register_lock;
59LIST_HEAD(bch_cache_sets); 50LIST_HEAD(bch_cache_sets);
@@ -562,7 +553,7 @@ void bch_prio_write(struct cache *ca)
562 } 553 }
563 554
564 p->next_bucket = ca->prio_buckets[i + 1]; 555 p->next_bucket = ca->prio_buckets[i + 1];
565 p->magic = pset_magic(ca); 556 p->magic = pset_magic(&ca->sb);
566 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); 557 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
567 558
568 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); 559 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true);
@@ -613,7 +604,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
613 if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) 604 if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
614 pr_warn("bad csum reading priorities"); 605 pr_warn("bad csum reading priorities");
615 606
616 if (p->magic != pset_magic(ca)) 607 if (p->magic != pset_magic(&ca->sb))
617 pr_warn("bad magic reading priorities"); 608 pr_warn("bad magic reading priorities");
618 609
619 bucket = p->next_bucket; 610 bucket = p->next_bucket;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ea345c6896f4..38ae7a4ce928 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -27,16 +27,6 @@ struct closure;
27 27
28#endif 28#endif
29 29
30#define BITMASK(name, type, field, offset, size) \
31static inline uint64_t name(const type *k) \
32{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \
33 \
34static inline void SET_##name(type *k, uint64_t v) \
35{ \
36 k->field &= ~(~((uint64_t) ~0 << size) << offset); \
37 k->field |= v << offset; \
38}
39
40#define DECLARE_HEAP(type, name) \ 30#define DECLARE_HEAP(type, name) \
41 struct { \ 31 struct { \
42 size_t size, used; \ 32 size_t size, used; \
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
new file mode 100644
index 000000000000..164a7e263988
--- /dev/null
+++ b/include/uapi/linux/bcache.h
@@ -0,0 +1,373 @@
1#ifndef _LINUX_BCACHE_H
2#define _LINUX_BCACHE_H
3
4/*
5 * Bcache on disk data structures
6 */
7
8#include <asm/types.h>
9
10#define BITMASK(name, type, field, offset, size) \
11static inline __u64 name(const type *k) \
12{ return (k->field >> offset) & ~(~0ULL << size); } \
13 \
14static inline void SET_##name(type *k, __u64 v) \
15{ \
16 k->field &= ~(~(~0ULL << size) << offset); \
17 k->field |= (v & ~(~0ULL << size)) << offset; \
18}
19
20/* Btree keys - all units are in sectors */
21
22struct bkey {
23 __u64 high;
24 __u64 low;
25 __u64 ptr[];
26};
27
28#define KEY_FIELD(name, field, offset, size) \
29 BITMASK(name, struct bkey, field, offset, size)
30
31#define PTR_FIELD(name, offset, size) \
32static inline __u64 name(const struct bkey *k, unsigned i) \
33{ return (k->ptr[i] >> offset) & ~(~0ULL << size); } \
34 \
35static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \
36{ \
37 k->ptr[i] &= ~(~(~0ULL << size) << offset); \
38 k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \
39}
40
41#define KEY_SIZE_BITS 16
42
43KEY_FIELD(KEY_PTRS, high, 60, 3)
44KEY_FIELD(HEADER_SIZE, high, 58, 2)
45KEY_FIELD(KEY_CSUM, high, 56, 2)
46KEY_FIELD(KEY_PINNED, high, 55, 1)
47KEY_FIELD(KEY_DIRTY, high, 36, 1)
48
49KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS)
50KEY_FIELD(KEY_INODE, high, 0, 20)
51
52/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
53
54static inline __u64 KEY_OFFSET(const struct bkey *k)
55{
56 return k->low;
57}
58
59static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v)
60{
61 k->low = v;
62}
63
64/*
65 * The high bit being set is a relic from when we used it to do binary
66 * searches - it told you where a key started. It's not used anymore,
67 * and can probably be safely dropped.
68 */
69#define KEY(inode, offset, size) \
70((struct bkey) { \
71 .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \
72 .low = (offset) \
73})
74
75#define ZERO_KEY KEY(0, 0, 0)
76
77#define MAX_KEY_INODE (~(~0 << 20))
78#define MAX_KEY_OFFSET (~0ULL >> 1)
79#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0)
80
81#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k))
82#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0)
83
84#define PTR_DEV_BITS 12
85
86PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS)
87PTR_FIELD(PTR_OFFSET, 8, 43)
88PTR_FIELD(PTR_GEN, 0, 8)
89
90#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1)
91
92#define PTR(gen, offset, dev) \
93 ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen)
94
95/* Bkey utility code */
96
97static inline unsigned long bkey_u64s(const struct bkey *k)
98{
99 return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k);
100}
101
102static inline unsigned long bkey_bytes(const struct bkey *k)
103{
104 return bkey_u64s(k) * sizeof(__u64);
105}
106
107#define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src))
108
109static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
110{
111 SET_KEY_INODE(dest, KEY_INODE(src));
112 SET_KEY_OFFSET(dest, KEY_OFFSET(src));
113}
114
115static inline struct bkey *bkey_next(const struct bkey *k)
116{
117 __u64 *d = (void *) k;
118 return (struct bkey *) (d + bkey_u64s(k));
119}
120
121static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys)
122{
123 __u64 *d = (void *) k;
124 return (struct bkey *) (d + nr_keys);
125}
126/* Enough for a key with 6 pointers */
127#define BKEY_PAD 8
128
129#define BKEY_PADDED(key) \
130 union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; }
131
132/* Superblock */
133
134/* Version 0: Cache device
135 * Version 1: Backing device
136 * Version 2: Seed pointer into btree node checksum
137 * Version 3: Cache device with new UUID format
138 * Version 4: Backing device with data offset
139 */
140#define BCACHE_SB_VERSION_CDEV 0
141#define BCACHE_SB_VERSION_BDEV 1
142#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
143#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
144#define BCACHE_SB_MAX_VERSION 4
145
146#define SB_SECTOR 8
147#define SB_SIZE 4096
148#define SB_LABEL_SIZE 32
149#define SB_JOURNAL_BUCKETS 256U
150/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
151#define MAX_CACHES_PER_SET 8
152
153#define BDEV_DATA_START_DEFAULT 16 /* sectors */
154
155struct cache_sb {
156 __u64 csum;
157 __u64 offset; /* sector where this sb was written */
158 __u64 version;
159
160 __u8 magic[16];
161
162 __u8 uuid[16];
163 union {
164 __u8 set_uuid[16];
165 __u64 set_magic;
166 };
167 __u8 label[SB_LABEL_SIZE];
168
169 __u64 flags;
170 __u64 seq;
171 __u64 pad[8];
172
173 union {
174 struct {
175 /* Cache devices */
176 __u64 nbuckets; /* device size */
177
178 __u16 block_size; /* sectors */
179 __u16 bucket_size; /* sectors */
180
181 __u16 nr_in_set;
182 __u16 nr_this_dev;
183 };
184 struct {
185 /* Backing devices */
186 __u64 data_offset;
187
188 /*
189 * block_size from the cache device section is still used by
190 * backing devices, so don't add anything here until we fix
191 * things to not need it for backing devices anymore
192 */
193 };
194 };
195
196 __u32 last_mount; /* time_t */
197
198 __u16 first_bucket;
199 union {
200 __u16 njournal_buckets;
201 __u16 keys;
202 };
203 __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */
204};
205
206static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
207{
208 return sb->version == BCACHE_SB_VERSION_BDEV
209 || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
210}
211
212BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
213BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
214BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
215#define CACHE_REPLACEMENT_LRU 0U
216#define CACHE_REPLACEMENT_FIFO 1U
217#define CACHE_REPLACEMENT_RANDOM 2U
218
219BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
220#define CACHE_MODE_WRITETHROUGH 0U
221#define CACHE_MODE_WRITEBACK 1U
222#define CACHE_MODE_WRITEAROUND 2U
223#define CACHE_MODE_NONE 3U
224BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
225#define BDEV_STATE_NONE 0U
226#define BDEV_STATE_CLEAN 1U
227#define BDEV_STATE_DIRTY 2U
228#define BDEV_STATE_STALE 3U
229
230/*
231 * Magic numbers
232 *
233 * The various other data structures have their own magic numbers, which are
234 * xored with the first part of the cache set's UUID
235 */
236
237#define JSET_MAGIC 0x245235c1a3625032ULL
238#define PSET_MAGIC 0x6750e15f87337f91ULL
239#define BSET_MAGIC 0x90135c78b99e07f5ULL
240
241static inline __u64 jset_magic(struct cache_sb *sb)
242{
243 return sb->set_magic ^ JSET_MAGIC;
244}
245
246static inline __u64 pset_magic(struct cache_sb *sb)
247{
248 return sb->set_magic ^ PSET_MAGIC;
249}
250
251static inline __u64 bset_magic(struct cache_sb *sb)
252{
253 return sb->set_magic ^ BSET_MAGIC;
254}
255
256/*
257 * Journal
258 *
259 * On disk format for a journal entry:
260 * seq is monotonically increasing; every journal entry has its own unique
261 * sequence number.
262 *
263 * last_seq is the oldest journal entry that still has keys the btree hasn't
264 * flushed to disk yet.
265 *
266 * version is for on disk format changes.
267 */
268
269#define BCACHE_JSET_VERSION_UUIDv1 1
270#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */
271#define BCACHE_JSET_VERSION 1
272
273struct jset {
274 __u64 csum;
275 __u64 magic;
276 __u64 seq;
277 __u32 version;
278 __u32 keys;
279
280 __u64 last_seq;
281
282 BKEY_PADDED(uuid_bucket);
283 BKEY_PADDED(btree_root);
284 __u16 btree_level;
285 __u16 pad[3];
286
287 __u64 prio_bucket[MAX_CACHES_PER_SET];
288
289 union {
290 struct bkey start[0];
291 __u64 d[0];
292 };
293};
294
295/* Bucket prios/gens */
296
297struct prio_set {
298 __u64 csum;
299 __u64 magic;
300 __u64 seq;
301 __u32 version;
302 __u32 pad;
303
304 __u64 next_bucket;
305
306 struct bucket_disk {
307 __u16 prio;
308 __u8 gen;
309 } __attribute((packed)) data[];
310};
311
312/* UUIDS - per backing device/flash only volume metadata */
313
314struct uuid_entry {
315 union {
316 struct {
317 __u8 uuid[16];
318 __u8 label[32];
319 __u32 first_reg;
320 __u32 last_reg;
321 __u32 invalidated;
322
323 __u32 flags;
324 /* Size of flash only volumes */
325 __u64 sectors;
326 };
327
328 __u8 pad[128];
329 };
330};
331
332BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
333
334/* Btree nodes */
335
336/* Version 1: Seed pointer into btree node checksum
337 */
338#define BCACHE_BSET_CSUM 1
339#define BCACHE_BSET_VERSION 1
340
341/*
342 * Btree nodes
343 *
344 * On disk a btree node is a list/log of these; within each set the keys are
345 * sorted
346 */
347struct bset {
348 __u64 csum;
349 __u64 magic;
350 __u64 seq;
351 __u32 version;
352 __u32 keys;
353
354 union {
355 struct bkey start[0];
356 __u64 d[0];
357 };
358};
359
360/* OBSOLETE */
361
362/* UUIDS - per backing device/flash only volume metadata */
363
364struct uuid_entry_v0 {
365 __u8 uuid[16];
366 __u8 label[32];
367 __u32 first_reg;
368 __u32 last_reg;
369 __u32 invalidated;
370 __u32 pad;
371};
372
373#endif /* _LINUX_BCACHE_H */