author     Joe Thornber <thornber@redhat.com>     2011-10-31 16:19:11 -0400
committer  Alasdair G Kergon <agk@redhat.com>     2011-10-31 16:19:11 -0400
commit     3241b1d3e0aaafbfcd320f4d71ade629728cc4f4
tree       499461f724d4db3d7118641f4a20f5be23549edd /drivers/md/persistent-data/dm-btree.c
parent     95d402f057f2e208e4631893f6cd4a59c7c05e41
dm: add persistent data library
The persistent-data library offers a re-usable framework for the storage
and management of on-disk metadata in device-mapper targets.
It's used by the thin-provisioning target in the next patch and in an
upcoming hierarchical storage target.
For further information, please read
Documentation/device-mapper/persistent-data.txt
Signed-off-by: Joe Thornber <thornber@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Diffstat (limited to 'drivers/md/persistent-data/dm-btree.c')
-rw-r--r--   drivers/md/persistent-data/dm-btree.c   805
1 file changed, 805 insertions(+), 0 deletions(-)
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
new file mode 100644
index 000000000000..e0638be53ea4
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -0,0 +1,805 @@
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/module.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
        __dm_written_to_disk(src)
{
        memcpy(dest, src, len);
        __dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
                         unsigned index, void *elt)
        __dm_written_to_disk(elt)
{
        if (index < nr_elts)
                memmove(base + (elt_size * (index + 1)),
                        base + (elt_size * index),
                        (nr_elts - index) * elt_size);

        memcpy_disk(base + (elt_size * index), elt, elt_size);
}
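
/*
 * Editor's illustration (not part of the original patch): inserting X at
 * index 1 of [A, B, C] (nr_elts = 3) first shifts B and C up one slot,
 * then copies X in, giving [A, X, B, C].  The caller must have room for
 * nr_elts + 1 elements at base, and X must already be blessed for disk.
 */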

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct node *n, uint64_t key, int want_hi)
{
        int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

        while (hi - lo > 1) {
                int mid = lo + ((hi - lo) / 2);
                uint64_t mid_key = le64_to_cpu(n->keys[mid]);

                if (mid_key == key)
                        return mid;

                if (mid_key < key)
                        lo = mid;
                else
                        hi = mid;
        }

        return want_hi ? hi : lo;
}

int lower_bound(struct node *n, uint64_t key)
{
        return bsearch(n, key, 0);
}
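
/*
 * Editor's illustration (not part of the original patch): lower_bound()
 * returns the index of the greatest key <= the search key, or -1 when
 * every key in the node is larger.  For a node whose keys are
 * { 10, 20, 30 }:
 *
 *      lower_bound(n, 5)  -> -1
 *      lower_bound(n, 20) ->  1
 *      lower_bound(n, 25) ->  1
 *      lower_bound(n, 99) ->  2
 */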

void inc_children(struct dm_transaction_manager *tm, struct node *n,
                  struct dm_btree_value_type *vt)
{
        unsigned i;
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

        if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
                for (i = 0; i < nr_entries; i++)
                        dm_tm_inc(tm, value64(n, i));
        else if (vt->inc)
                for (i = 0; i < nr_entries; i++)
                        vt->inc(vt->context,
                                value_ptr(n, i, vt->size));
}

static int insert_at(size_t value_size, struct node *node, unsigned index,
                     uint64_t key, void *value)
        __dm_written_to_disk(value)
{
        uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
        __le64 key_le = cpu_to_le64(key);

        if (index > nr_entries ||
            index >= le32_to_cpu(node->header.max_entries)) {
                DMERR("too many entries in btree node for insert");
                __dm_unbless_for_disk(value);
                return -ENOMEM;
        }

        __dm_bless_for_disk(&key_le);

        array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
        array_insert(value_base(node), value_size, nr_entries, index, value);
        node->header.nr_entries = cpu_to_le32(nr_entries + 1);

        return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
        uint32_t total, n;
        size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

        block_size -= sizeof(struct node_header);
        total = block_size / elt_size;
        n = total / 3;          /* rounds down */

        return 3 * n;
}
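
/*
 * Worked example (editor's note; the 32-byte header is an assumption for
 * illustration -- the real size is sizeof(struct node_header) from
 * dm-btree-internal.h): with a 4096-byte block and 8-byte values,
 * elt_size = 8 + 8 = 16, total = (4096 - 32) / 16 = 254, n = 254 / 3 = 84,
 * so each node holds 3 * 84 = 252 entries.
 */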

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
        int r;
        struct dm_block *b;
        struct node *n;
        size_t block_size;
        uint32_t max_entries;

        r = new_block(info, &b);
        if (r < 0)
                return r;

        block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
        max_entries = calc_max_entries(info->value_type.size, block_size);

        n = dm_block_data(b);
        memset(n, 0, block_size);
        n->header.flags = cpu_to_le32(LEAF_NODE);
        n->header.nr_entries = cpu_to_le32(0);
        n->header.max_entries = cpu_to_le32(max_entries);
        n->header.value_size = cpu_to_le32(info->value_type.size);

        *root = dm_block_location(b);
        return unlock_block(info, b);
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
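
/*
 * Usage sketch (editor's illustration, not part of the original patch):
 * creating an empty one-level tree of plain __le64 values.  The fields
 * follow struct dm_btree_info and struct dm_btree_value_type as they are
 * used elsewhere in this file.
 */
static int example_create(struct dm_transaction_manager *tm, dm_block_t *root)
{
        struct dm_btree_info info;

        info.tm = tm;
        info.levels = 1;                        /* one 64-bit key per entry */
        info.value_type.context = NULL;
        info.value_type.size = sizeof(__le64);
        info.value_type.inc = NULL;             /* plain values: no refcounting */
        info.value_type.dec = NULL;
        info.value_type.equal = NULL;

        return dm_btree_empty(&info, root);
}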

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
        struct dm_block *b;
        struct node *n;
        unsigned level;
        unsigned nr_children;
        unsigned current_child;
};

struct del_stack {
        struct dm_transaction_manager *tm;
        int top;
        struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
        if (s->top < 0) {
                DMERR("btree deletion stack empty");
                return -EINVAL;
        }

        *f = s->spine + s->top;

        return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
        return s->top >= 0;
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
        int r;
        uint32_t ref_count;

        if (s->top >= MAX_SPINE_DEPTH - 1) {
                DMERR("btree deletion stack out of memory");
                return -ENOMEM;
        }

        r = dm_tm_ref(s->tm, b, &ref_count);
        if (r)
                return r;

        if (ref_count > 1)
                /*
                 * This is a shared node, so we can just decrement its
                 * reference count and leave the children alone.
                 */
                dm_tm_dec(s->tm, b);

        else {
                struct frame *f = s->spine + ++s->top;

                r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
                if (r) {
                        s->top--;
                        return r;
                }

                f->n = dm_block_data(f->b);
                f->level = level;
                f->nr_children = le32_to_cpu(f->n->header.nr_entries);
                f->current_child = 0;
        }

        return 0;
}

static void pop_frame(struct del_stack *s)
{
        struct frame *f = s->spine + s->top--;

        dm_tm_dec(s->tm, dm_block_location(f->b));
        dm_tm_unlock(s->tm, f->b);
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
        int r;
        struct del_stack *s;

        s = kmalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
                return -ENOMEM;
        s->tm = info->tm;
        s->top = -1;

        r = push_frame(s, root, 1);
        if (r)
                goto out;

        while (unprocessed_frames(s)) {
                uint32_t flags;
                struct frame *f;
                dm_block_t b;

                r = top_frame(s, &f);
                if (r)
                        goto out;

                if (f->current_child >= f->nr_children) {
                        pop_frame(s);
                        continue;
                }

                flags = le32_to_cpu(f->n->header.flags);
                if (flags & INTERNAL_NODE) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level);
                        if (r)
                                goto out;

                } else if (f->level != (info->levels - 1)) {
                        b = value64(f->n, f->current_child);
                        f->current_child++;
                        r = push_frame(s, b, f->level + 1);
                        if (r)
                                goto out;

                } else {
                        if (info->value_type.dec) {
                                unsigned i;

                                for (i = 0; i < f->nr_children; i++)
                                        info->value_type.dec(info->value_type.context,
                                                             value_ptr(f->n, i, info->value_type.size));
                        }
                        f->current_child = f->nr_children;
                }
        }

out:
        kfree(s);
        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
                            int (*search_fn)(struct node *, uint64_t),
                            uint64_t *result_key, void *v, size_t value_size)
{
        int i, r;
        uint32_t flags, nr_entries;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                i = search_fn(ro_node(s), key);

                flags = le32_to_cpu(ro_node(s)->header.flags);
                nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (i < 0 || i >= nr_entries)
                        return -ENODATA;

                if (flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (!(flags & LEAF_NODE));

        *result_key = le64_to_cpu(ro_node(s)->keys[i]);
        memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);

        return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value_le)
{
        unsigned level, last_level = info->levels - 1;
        int r = -ENODATA;
        uint64_t rkey;
        __le64 internal_value_le;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                size_t size;
                void *value_p;

                if (level == last_level) {
                        value_p = value_le;
                        size = info->value_type.size;

                } else {
                        value_p = &internal_value_le;
                        size = sizeof(uint64_t);
                }

                r = btree_lookup_raw(&spine, root, keys[level],
                                     lower_bound, &rkey,
                                     value_p, size);

                if (!r) {
                        if (rkey != keys[level]) {
                                exit_ro_spine(&spine);
                                return -ENODATA;
                        }
                } else {
                        exit_ro_spine(&spine);
                        return r;
                }

                root = le64_to_cpu(internal_value_le);
        }
        exit_ro_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
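
/*
 * Usage sketch (editor's illustration, not part of the original patch):
 * looking a key up in a one-level tree of __le64 values, such as the one
 * built by example_create() above.  Values are stored little-endian on
 * disk, so the caller converts after the copy-out.
 */
static int example_lookup(struct dm_btree_info *info, dm_block_t root,
                          uint64_t key, uint64_t *result)
{
        __le64 value_le;
        int r = dm_btree_lookup(info, root, &key, &value_le);

        if (!r)
                *result = le64_to_cpu(value_le);

        return r;       /* -ENODATA when the key is absent */
}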

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *           +--------+
 *           | Parent |
 *           +--------+
 *               |
 *               v
 *         +----------+
 *         | A ++++++ |
 *         +----------+
 *
 * After:
 *           +--------+
 *           | Parent |
 *           +--------+
 *             |    |
 *             v    +------+
 *       +----------+      |
 *       | A* +++   |      v
 *       +----------+  +---------+
 *                     | B +++   |
 *                     +---------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
                               unsigned parent_index, uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *parent;
        struct node *ln, *rn, *pn;
        __le64 location;

        left = shadow_current(s);

        r = new_block(s->info, &right);
        if (r < 0)
                return r;

        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
        nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

        ln->header.nr_entries = cpu_to_le32(nr_left);

        rn->header.flags = ln->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = ln->header.max_entries;
        rn->header.value_size = ln->header.value_size;
        memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

        size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
                sizeof(uint64_t) : s->info->value_type.size;
        memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
               size * nr_right);

        /*
         * Patch up the parent
         */
        parent = shadow_parent(s);

        pn = dm_block_data(parent);
        location = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&location);
        memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
                    &location, sizeof(__le64));

        location = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&location);

        r = insert_at(sizeof(__le64), pn, parent_index + 1,
                      le64_to_cpu(rn->keys[0]), &location);
        if (r)
                return r;

        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }

        return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *         +----------+
 *         | A ++++++ |
 *         +----------+
 *
 * After:
 *       +------------+
 *       | A (shadow) |
 *       +------------+
 *           |      |
 *   +-------+      +-------+
 *   |                      |
 *   v                      v
 * +-------+            +-------+
 * | B +++ |            | C +++ |
 * +-------+            +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
        int r;
        size_t size;
        unsigned nr_left, nr_right;
        struct dm_block *left, *right, *new_parent;
        struct node *pn, *ln, *rn;
        __le64 val;

        new_parent = shadow_current(s);

        r = new_block(s->info, &left);
        if (r < 0)
                return r;

        r = new_block(s->info, &right);
        if (r < 0) {
                /* FIXME: put left */
                return r;
        }

        pn = dm_block_data(new_parent);
        ln = dm_block_data(left);
        rn = dm_block_data(right);

        nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
        nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

        ln->header.flags = pn->header.flags;
        ln->header.nr_entries = cpu_to_le32(nr_left);
        ln->header.max_entries = pn->header.max_entries;
        ln->header.value_size = pn->header.value_size;

        rn->header.flags = pn->header.flags;
        rn->header.nr_entries = cpu_to_le32(nr_right);
        rn->header.max_entries = pn->header.max_entries;
        rn->header.value_size = pn->header.value_size;

        memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
        memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

        size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
                sizeof(__le64) : s->info->value_type.size;
        memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
        memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
               nr_right * size);

        /* new_parent should just point to l and r now */
        pn->header.flags = cpu_to_le32(INTERNAL_NODE);
        pn->header.nr_entries = cpu_to_le32(2);
        pn->header.max_entries = cpu_to_le32(
                calc_max_entries(sizeof(__le64),
                                 dm_bm_block_size(
                                         dm_tm_get_bm(s->info->tm))));
        pn->header.value_size = cpu_to_le32(sizeof(__le64));

        val = cpu_to_le64(dm_block_location(left));
        __dm_bless_for_disk(&val);
        pn->keys[0] = ln->keys[0];
        memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));

        val = cpu_to_le64(dm_block_location(right));
        __dm_bless_for_disk(&val);
        pn->keys[1] = rn->keys[0];
        memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));

        /*
         * Rejig the spine.  This is ugly, since it knows too
         * much about the spine.
         */
        if (s->nodes[0] != new_parent) {
                unlock_block(s->info, s->nodes[0]);
                s->nodes[0] = new_parent;
        }
        if (key < le64_to_cpu(rn->keys[0])) {
                unlock_block(s->info, right);
                s->nodes[1] = left;
        } else {
                unlock_block(s->info, left);
                s->nodes[1] = right;
        }
        s->count = 2;

        return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
                            struct dm_btree_value_type *vt,
                            uint64_t key, unsigned *index)
{
        int r, i = *index, top = 1;
        struct node *node;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        return r;

                node = dm_block_data(shadow_current(s));

                /*
                 * We have to patch up the parent node; ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

                        __dm_bless_for_disk(&location);
                        memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
                                    &location, sizeof(__le64));
                }

                node = dm_block_data(shadow_current(s));

                if (node->header.nr_entries == node->header.max_entries) {
                        if (top)
                                r = btree_split_beneath(s, key);
                        else
                                r = btree_split_sibling(s, root, i, key);

                        if (r < 0)
                                return r;
                }

                node = dm_block_data(shadow_current(s));

                i = lower_bound(node, key);

                if (le32_to_cpu(node->header.flags) & LEAF_NODE)
                        break;

                if (i < 0) {
                        /* change the bounds on the lowest key */
                        node->keys[0] = cpu_to_le64(key);
                        i = 0;
                }

                root = value64(node, i);
                top = 0;
        }

        if (i < 0 || le64_to_cpu(node->keys[i]) != key)
                i++;

        *index = i;
        return 0;
}

static int insert(struct dm_btree_info *info, dm_block_t root,
                  uint64_t *keys, void *value, dm_block_t *new_root,
                  int *inserted)
        __dm_written_to_disk(value)
{
        int r, need_insert;
        unsigned level, index = -1, last_level = info->levels - 1;
        dm_block_t block = root;
        struct shadow_spine spine;
        struct node *n;
        struct dm_btree_value_type le64_type;

        le64_type.context = NULL;
        le64_type.size = sizeof(__le64);
        le64_type.inc = NULL;
        le64_type.dec = NULL;
        le64_type.equal = NULL;

        init_shadow_spine(&spine, info);

        for (level = 0; level < (info->levels - 1); level++) {
                r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
                if (r < 0)
                        goto bad;

                n = dm_block_data(shadow_current(&spine));
                need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                               (le64_to_cpu(n->keys[index]) != keys[level]));

                if (need_insert) {
                        dm_block_t new_tree;
                        __le64 new_le;

                        r = dm_btree_empty(info, &new_tree);
                        if (r < 0)
                                goto bad;

                        new_le = cpu_to_le64(new_tree);
                        __dm_bless_for_disk(&new_le);

                        r = insert_at(sizeof(uint64_t), n, index,
                                      keys[level], &new_le);
                        if (r)
                                goto bad;
                }

                if (level < last_level)
                        block = value64(n, index);
        }

        r = btree_insert_raw(&spine, block, &info->value_type,
                             keys[level], &index);
        if (r < 0)
                goto bad;

        n = dm_block_data(shadow_current(&spine));
        need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
                       (le64_to_cpu(n->keys[index]) != keys[level]));

        if (need_insert) {
                if (inserted)
                        *inserted = 1;

                r = insert_at(info->value_type.size, n, index,
                              keys[level], value);
                if (r)
                        goto bad_unblessed;
        } else {
                if (inserted)
                        *inserted = 0;

                if (info->value_type.dec &&
                    (!info->value_type.equal ||
                     !info->value_type.equal(
                             info->value_type.context,
                             value_ptr(n, index, info->value_type.size),
                             value))) {
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index, info->value_type.size));
                }
                memcpy_disk(value_ptr(n, index, info->value_type.size),
                            value, info->value_type.size);
        }

        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return 0;

bad:
        __dm_unbless_for_disk(value);
bad_unblessed:
        exit_shadow_spine(&spine);
        return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, void *value, dm_block_t *new_root)
        __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
                           uint64_t *keys, void *value, dm_block_t *new_root,
                           int *inserted)
        __dm_written_to_disk(value)
{
        return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
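
/*
 * Usage sketch (editor's illustration, not part of the original patch):
 * inserting a little-endian value.  dm_btree_insert() is annotated
 * __dm_written_to_disk(value), so the caller blesses the value first and
 * insert() unblesses it on every path.  The tree is copy-on-write, so the
 * (possibly new) root comes back through new_root.
 */
static int example_insert(struct dm_btree_info *info, dm_block_t root,
                          uint64_t key, uint64_t value, dm_block_t *new_root)
{
        __le64 value_le = cpu_to_le64(value);

        __dm_bless_for_disk(&value_le);
        return dm_btree_insert(info, root, &key, &value_le, new_root);
}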

/*----------------------------------------------------------------*/

static int find_highest_key(struct ro_spine *s, dm_block_t block,
                            uint64_t *result_key, dm_block_t *next_block)
{
        int i, r;
        uint32_t flags;

        do {
                r = ro_step(s, block);
                if (r < 0)
                        return r;

                flags = le32_to_cpu(ro_node(s)->header.flags);
                i = le32_to_cpu(ro_node(s)->header.nr_entries);
                if (!i)
                        return -ENODATA;
                else
                        i--;

                *result_key = le64_to_cpu(ro_node(s)->keys[i]);
                if (next_block || flags & INTERNAL_NODE)
                        block = value64(ro_node(s), i);

        } while (flags & INTERNAL_NODE);

        if (next_block)
                *next_block = block;
        return 0;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
                              uint64_t *result_keys)
{
        int r = 0, count = 0, level;
        struct ro_spine spine;

        init_ro_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = find_highest_key(&spine, root, result_keys + level,
                                     level == info->levels - 1 ? NULL : &root);
                if (r == -ENODATA) {
                        r = 0;
                        break;

                } else if (r)
                        break;

                count++;
        }
        exit_ro_spine(&spine);

        return r ? r : count;
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
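
/*
 * Usage sketch (editor's illustration, not part of the original patch):
 * result_keys must have room for one key per tree level.  The call
 * returns the number of levels it filled in -- possibly fewer than
 * info->levels if a sub-tree is empty -- or a negative errno.  Treating
 * a partial fill as "no data" is this sketch's choice, not the API's.
 */
static int example_highest(struct dm_btree_info *info, dm_block_t root,
                           uint64_t *result_keys)
{
        int r = dm_btree_find_highest_key(info, root, result_keys);

        if (r < 0)
                return r;

        return (r == info->levels) ? 0 : -ENODATA;
}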