Diffstat (limited to 'drivers/md/bcache/btree.h')
-rw-r--r--  drivers/md/bcache/btree.h  405
1 files changed, 405 insertions, 0 deletions

diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
new file mode 100644
index 000000000000..af4a7092a28c
--- /dev/null
+++ b/drivers/md/bcache/btree.h
@@ -0,0 +1,405 @@
#ifndef _BCACHE_BTREE_H
#define _BCACHE_BTREE_H

/*
 * THE BTREE:
 *
 * At a high level, bcache's btree is a relatively standard b+ tree. All keys
 * and pointers are in the leaves; interior nodes only have pointers to the
 * child nodes.
 *
 * In the interior nodes, a struct bkey always points to a child btree node, and
 * the key is the highest key in the child node - except that the highest key in
 * an interior node is always MAX_KEY. The size field refers to the size on disk
 * of the child node - this would allow us to have variable sized btree nodes
 * (handy for keeping the depth of the btree 1 by expanding just the root).
 *
 * Btree nodes are themselves log structured, but this is hidden fairly
 * thoroughly. Btree nodes on disk will in practice have extents that overlap
 * (because they were written at different times), but in memory we never have
 * overlapping extents - when we read in a btree node from disk, the first thing
 * we do is resort all the sets of keys with a mergesort, and in the same pass
 * we check for overlapping extents and adjust them appropriately.
 *
 * struct btree_op is a central interface to the btree code. It's used for
 * specifying read vs. write locking, and the embedded closure is used for
 * waiting on IO or for reserve memory.
 *
 * BTREE CACHE:
 *
 * Btree nodes are cached in memory; traversing the btree might require reading
 * in btree nodes, which is handled mostly transparently.
 *
 * bch_btree_node_get() looks up a btree node in the cache and reads it in from
 * disk if necessary. This function is almost never called directly though - the
 * btree() macro is used to get a btree node, call some function on it, and
 * unlock the node after the function returns.
 *
 * The root is special cased - it's taken out of the cache's lru (thus pinning
 * it in memory), so we can find the root of the btree by just dereferencing a
 * pointer instead of looking it up in the cache. This makes locking a bit
 * tricky, since the root pointer is protected by the lock in the btree node it
 * points to - the btree_root() macro handles this.
 *
 * In various places we must be able to allocate memory for multiple btree nodes
 * in order to make forward progress. To do this we use the btree cache itself
 * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
 * cache we can reuse. We can't allow more than one thread to be doing this at a
 * time, so there's a lock, implemented by a pointer to the btree_op closure -
 * this allows the btree_root() macro to implicitly release this lock.
 *
 * BTREE IO:
 *
 * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
 * this.
 *
 * For writing, we have two btree_write structs embedded in struct btree - one
 * write in flight, and one being set up, and we toggle between them.
 *
 * Writing is done with a single function - bch_btree_write() really serves two
 * different purposes and should be broken up into two different functions. When
 * passing now = false, it merely indicates that the node is now dirty - calling
 * it ensures that the dirty keys will be written at some point in the future.
 *
 * When passing now = true, bch_btree_write() causes a write to happen
 * "immediately" (if there was already a write in flight, it'll cause the write
 * to happen as soon as the previous write completes). It returns immediately,
 * but it takes a refcount on the closure in struct btree_op you passed to it,
 * so a closure_sync() later can be used to wait for the write to complete.
 *
 * This is handy because btree_split() and garbage collection can issue writes
 * in parallel, reducing the amount of time they have to hold write locks.
 *
 * LOCKING:
 *
 * When traversing the btree, we may need write locks starting at some level -
 * inserting a key into the btree will typically only require a write lock on
 * the leaf node.
 *
 * This is specified with the lock field in struct btree_op; lock = 0 means we
 * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
 * checks this field and returns the node with the appropriate lock held.
 *
 * If, after traversing the btree, the insertion code discovers it has to split
 * then it must restart from the root and take new locks - to do this it changes
 * the lock field and returns -EINTR, which causes the btree_root() macro to
 * loop.
 *
 * Handling cache misses requires a different mechanism for upgrading to a write
 * lock. We do cache lookups with only a read lock held, but if we get a cache
 * miss and we wish to insert this data into the cache, we have to insert a
 * placeholder key to detect races - otherwise, we could race with a write and
 * overwrite the data that was just written to the cache with stale data from
 * the backing device.
 *
 * For this we use a sequence number that write locks and unlocks increment; to
 * insert the check key, the code unlocks the btree node and then takes a write
 * lock, and fails if the sequence number doesn't match.
 */

#include "bset.h"
#include "debug.h"

struct btree_write {
        struct closure *owner;
        atomic_t *journal;

        /* If btree_split() frees a btree node, it writes a new pointer to that
         * btree node indicating it was freed; it takes a refcount on
         * c->prio_blocked because we can't write the gens until the new
         * pointer is on disk. This allows btree_write_endio() to release the
         * refcount that btree_split() took.
         */
        int prio_blocked;
};

struct btree {
        /* Hottest entries first */
        struct hlist_node hash;

        /* Key/pointer for this btree node */
        BKEY_PADDED(key);

        /* Single bit - set when accessed, cleared by shrinker */
        unsigned long accessed;
        unsigned long seq;
        struct rw_semaphore lock;
        struct cache_set *c;

        unsigned long flags;
        uint16_t written;       /* would be nice to kill */
        uint8_t level;
        uint8_t nsets;
        uint8_t page_order;

        /*
         * Set of sorted keys - the real btree node - plus a binary search tree
         *
         * sets[0] is special; sets[0]->tree, sets[0]->prev and sets[0]->data
         * point to the memory we have allocated for this btree node.
         * Additionally, sets[0]->data points to the entire btree node as it
         * exists on disk.
         */
        struct bset_tree sets[MAX_BSETS];

        /* Used to refcount bio splits, also protects b->bio */
        struct closure_with_waitlist io;

        /* Gets transferred to w->prio_blocked - see the comment there */
        int prio_blocked;

        struct list_head list;
        struct delayed_work work;

        uint64_t io_start_time;
        struct btree_write writes[2];
        struct bio *bio;
};

#define BTREE_FLAG(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
\
static inline void set_btree_node_ ## flag(struct btree *b) \
{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \

enum btree_flags {
        BTREE_NODE_read_done,
        BTREE_NODE_io_error,
        BTREE_NODE_dirty,
        BTREE_NODE_write_idx,
};

BTREE_FLAG(read_done);
BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
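
/*
 * Each BTREE_FLAG() invocation above generates a pair of helpers; for example
 * BTREE_FLAG(dirty) expands (roughly) to:
 *
 *        static inline bool btree_node_dirty(struct btree *b)
 *        { return test_bit(BTREE_NODE_dirty, &b->flags); }
 *
 *        static inline void set_btree_node_dirty(struct btree *b)
 *        { set_bit(BTREE_NODE_dirty, &b->flags); }
 */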

/*
 * Of the two btree_write structs (see BTREE IO above), btree_current_write()
 * is the one being set up and btree_prev_write() is the one that may still be
 * in flight.
 */
static inline struct btree_write *btree_current_write(struct btree *b)
{
        return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
        return b->writes + (btree_node_write_idx(b) ^ 1);
}

/* Offset of bset i within the node, in 512 byte sectors */
static inline unsigned bset_offset(struct btree *b, struct bset *i)
{
        return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
}

/* Address of the first block of the node that hasn't been written to disk yet */
static inline struct bset *write_block(struct btree *b)
{
        return ((void *) b->sets[0].data) + b->written * block_bytes(b->c);
}

static inline bool bset_written(struct btree *b, struct bset_tree *t)
{
        return t->data < write_block(b);
}

static inline bool bkey_written(struct btree *b, struct bkey *k)
{
        return k < write_block(b)->start;
}

/* Reset sectors_to_gc to 1/8th of the cache size, in sectors */
static inline void set_gc_sectors(struct cache_set *c)
{
        atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
}

static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k)
{
        return __bch_ptr_invalid(b->c, b->level, k);
}

static inline struct bkey *bch_btree_iter_init(struct btree *b,
                                               struct btree_iter *iter,
                                               struct bkey *search)
{
        return __bch_btree_iter_init(b, iter, search, b->sets);
}

/* Looping macros */

#define for_each_cached_btree(b, c, iter) \
        for (iter = 0; \
             iter < ARRAY_SIZE((c)->bucket_hash); \
             iter++) \
                hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)

#define for_each_key_filter(b, k, iter, filter) \
        for (bch_btree_iter_init((b), (iter), NULL); \
             ((k) = bch_btree_iter_next_filter((iter), b, filter));)

#define for_each_key(b, k, iter) \
        for (bch_btree_iter_init((b), (iter), NULL); \
             ((k) = bch_btree_iter_next(iter));)

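/*
 * Example (an illustrative sketch, not code from btree.c): walking every key
 * in a node with an on-stack iterator:
 *
 *        struct btree_iter iter;
 *        struct bkey *k;
 *        unsigned bad = 0;
 *
 *        for_each_key(b, k, &iter)
 *                if (bch_ptr_invalid(b, k))
 *                        bad++;
 */
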
/* Recursing down the btree */

struct btree_op {
        struct closure cl;
        struct cache_set *c;

        /* Journal entry we have a refcount on */
        atomic_t *journal;

        /* Bio to be inserted into the cache */
        struct bio *cache_bio;

        unsigned inode;

        uint16_t write_prio;

        /* Btree level at which we start taking write locks */
        short lock;

        /* Btree insertion type */
        enum {
                BTREE_INSERT,
                BTREE_REPLACE
        } type:8;

        unsigned csum:1;
        unsigned skip:1;
        unsigned flush_journal:1;

        unsigned insert_data_done:1;
        unsigned lookup_done:1;
        unsigned insert_collision:1;

        /* Anything after this point won't get zeroed in do_bio_hook() */

        /* Keys to be inserted */
        struct keylist keys;
        BKEY_PADDED(replace);
};
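
/*
 * A btree_op normally lives on the caller's stack. A minimal setup might look
 * like this (an illustrative sketch; the field choice is an assumption, not
 * taken from the .c files):
 *
 *        struct btree_op op;
 *
 *        bch_btree_op_init_stack(&op);
 *        op.lock = 0;        // write lock leaf nodes only - see LOCKING above
 */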

void bch_btree_op_init_stack(struct btree_op *);

static inline void rw_lock(bool w, struct btree *b, int level)
{
        w ? down_write_nested(&b->lock, level + 1)
          : down_read_nested(&b->lock, level + 1);
        if (w)
                b->seq++;
}

static inline void rw_unlock(bool w, struct btree *b)
{
#ifdef CONFIG_BCACHE_EDEBUG
        unsigned i;

        if (w &&
            b->key.ptr[0] &&
            btree_node_read_done(b))
                for (i = 0; i <= b->nsets; i++)
                        bch_check_key_order(b, b->sets[i].data);
#endif

        if (w)
                b->seq++;
        (w ? up_write : up_read)(&b->lock);
}
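
/*
 * b->seq is the sequence number described at the top of this file: rw_lock()
 * and rw_unlock() both increment it when taking/releasing a write lock. A
 * sketch of how the check-key insertion can detect races when upgrading a read
 * lock (the exact shape is an assumption, not copied from btree.c):
 *
 *        unsigned long seq = b->seq;
 *
 *        rw_unlock(false, b);                // drop the read lock...
 *        rw_lock(true, b, b->level);        // ...and retake it as a write lock
 *
 *        if (b->seq != seq + 1)                // another writer got in between
 *                return false;
 */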

#define insert_lock(s, b)        ((b)->level <= (s)->lock)

/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; btree_root() will
 * call you again and you'll have the correct lock.
 */

/**
 * btree - recurse down the btree on a specified key
 * @fn: function to call, which will be passed the child node
 * @key: key to recurse on
 * @b: parent btree node
 * @op: pointer to struct btree_op
 */
#define btree(fn, key, b, op, ...) \
({ \
        int _r, l = (b)->level - 1; \
        bool _w = l <= (op)->lock; \
        struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \
        if (!IS_ERR(_b)) { \
                _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
                rw_unlock(_w, _b); \
        } else \
                _r = PTR_ERR(_b); \
        _r; \
})
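
/*
 * Illustrative sketch of a recursive bch_btree_* function using btree() (the
 * function name and body are hypothetical, not from btree.c):
 *
 *        static int bch_btree_example(struct btree *b, struct btree_op *op,
 *                                     struct bkey *k)
 *        {
 *                if (!b->level)
 *                        return 0;        // leaf node: do the real work here
 *
 *                // recurse into the child node that k points to
 *                return btree(example, k, b, op, k);
 *        }
 *
 * If such a function discovers it needs write locks further up (e.g. it has to
 * split), it sets op->lock to the level it needs and returns -EINTR, and
 * btree_root() below restarts the traversal.
 */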

/**
 * btree_root - call a function on the root of the btree
 * @fn: function to call, which will be passed the root node
 * @c: cache set
 * @op: pointer to struct btree_op
 */
#define btree_root(fn, c, op, ...) \
({ \
        int _r = -EINTR; \
        do { \
                struct btree *_b = (c)->root; \
                bool _w = insert_lock(op, _b); \
                rw_lock(_w, _b, _b->level); \
                if (_b == (c)->root && \
                    _w == insert_lock(op, _b)) \
                        _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \
                rw_unlock(_w, _b); \
                bch_cannibalize_unlock(c, &(op)->cl); \
        } while (_r == -EINTR); \
 \
        _r; \
})
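
/*
 * A top-level caller of the hypothetical function above would then look
 * roughly like this (again a sketch, not taken from the .c files):
 *
 *        int ret = btree_root(example, op->c, op, k);
 *
 * btree_root() locks the root, calls bch_btree_example() on it, and retries
 * the whole traversal for as long as it returns -EINTR.
 */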

static inline bool should_split(struct btree *b)
{
        struct bset *i = write_block(b);
        return b->written >= btree_blocks(b) ||
               (i->seq == b->sets[0].data->seq &&
                b->written + __set_blocks(i, i->keys + 15, b->c)
                > btree_blocks(b));
}

void bch_btree_read_done(struct closure *);
void bch_btree_read(struct btree *);
void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
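
/*
 * Usage sketch for the two bch_btree_write() modes described at the top of
 * this file (illustrative only, not copied from the .c files):
 *
 *        bch_btree_write(b, false, op);        // just mark b dirty; written later
 *
 *        bch_btree_write(b, true, op);        // start a write now...
 *        closure_sync(&op->cl);                // ...and wait for it to complete
 */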

void bch_cannibalize_unlock(struct cache_set *, struct closure *);
void bch_btree_set_root(struct btree *);
struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
                                 int, struct btree_op *);

bool bch_btree_insert_keys(struct btree *, struct btree_op *);
bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
                                struct bio *);
int bch_btree_insert(struct btree_op *, struct cache_set *);

int bch_btree_search_recurse(struct btree *, struct btree_op *);

void bch_queue_gc(struct cache_set *);
size_t bch_btree_gc_finish(struct cache_set *);
void bch_moving_gc(struct closure *);
int bch_btree_check(struct cache_set *, struct btree_op *);
uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);

void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *);
void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *);
bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
                                  struct bkey *);
void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
struct keybuf_key *bch_keybuf_next(struct keybuf *);
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
                                          struct keybuf *, struct bkey *);

#endif