Diffstat (limited to 'drivers/md/persistent-data/dm-btree-remove.c')
-rw-r--r--	drivers/md/persistent-data/dm-btree-remove.c	566
1 file changed, 566 insertions, 0 deletions
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
new file mode 100644
index 000000000000..65fd85ec6514
--- /dev/null
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -0,0 +1,566 @@
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree.h"
#include "dm-btree-internal.h"
#include "dm-transaction-manager.h"

#include <linux/module.h>

/*
 * Removing an entry from a btree
 * ==============================
 *
 * A very important constraint for our btree is that no node, except the
 * root, may have fewer than a certain number of entries.
 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 *
 * Ensuring this is complicated by the fact that we want to only ever
 * hold the locks on 2 nodes concurrently, and only change nodes in a top
 * to bottom fashion.
 *
 * Each node may have a left or right sibling.  When descending the spine,
 * if a node contains only MIN_ENTRIES then we try and increase this to at
 * least MIN_ENTRIES + 1.  We do this in the following ways:
 *
 * [A] No siblings => this can only happen if the node is the root, in which
 *     case we copy the child's contents over the root.
 *
 * [B] No left sibling
 *     ==> rebalance(node, right sibling)
 *
 * [C] No right sibling
 *     ==> rebalance(left sibling, node)
 *
 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 *     ==> delete node, adding its contents to left and right
 *
 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 *     ==> rebalance(left, node, right)
 *
 * After these operations it's possible that our original node no longer
 * contains the desired sub tree.  For this reason this rebalancing is
 * performed on the children of the current node.  This also avoids having
 * a special case for the root.
 *
 * Once this rebalancing has occurred we can then step into the child node
 * for internal nodes.  Or delete the entry for leaf nodes.
 */

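/*
 * Worked example (illustrative only; the numbers assume a node geometry
 * where MAX_ENTRIES = 126, so MIN_ENTRIES = 126 / 3 = 42):
 *
 * Descending towards a key, suppose the child we're about to enter holds
 * only 42 entries.  If it has both siblings and the three nodes hold 150
 * entries between them, their contents fit comfortably in two nodes, so
 * case [D] applies: the child is deleted and its entries are pushed into
 * the siblings.  If the three nodes held 210 entries instead, case [E]
 * applies and they are rebalanced to 70 entries each, after which it is
 * safe to step down.
 */
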
/*
 * Some little utilities for moving node data around.
 */
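/*
 * The sign of 'shift' selects the direction.  node_shift() with a
 * negative shift moves the trailing entries of @n down to the front,
 * discarding the first -shift entries; a positive shift opens up
 * 'shift' empty slots at the front.  node_copy() with a negative shift
 * appends the first -shift entries of @right onto the end of @left; a
 * positive shift copies the last 'shift' entries of @left over the
 * front of @right.  Neither helper updates nr_entries; callers adjust
 * the counts themselves.
 */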
static void node_shift(struct node *n, int shift)
{
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
        uint32_t value_size = le32_to_cpu(n->header.value_size);

        if (shift < 0) {
                shift = -shift;
                BUG_ON(shift > nr_entries);
                BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
                memmove(key_ptr(n, 0),
                        key_ptr(n, shift),
                        (nr_entries - shift) * sizeof(__le64));
                memmove(value_ptr(n, 0, value_size),
                        value_ptr(n, shift, value_size),
                        (nr_entries - shift) * value_size);
        } else {
                BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
                memmove(key_ptr(n, shift),
                        key_ptr(n, 0),
                        nr_entries * sizeof(__le64));
                memmove(value_ptr(n, shift, value_size),
                        value_ptr(n, 0, value_size),
                        nr_entries * value_size);
        }
}

static void node_copy(struct node *left, struct node *right, int shift)
{
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t value_size = le32_to_cpu(left->header.value_size);
        BUG_ON(value_size != le32_to_cpu(right->header.value_size));

        if (shift < 0) {
                shift = -shift;
                BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
                memcpy(key_ptr(left, nr_left),
                       key_ptr(right, 0),
                       shift * sizeof(__le64));
                memcpy(value_ptr(left, nr_left, value_size),
                       value_ptr(right, 0, value_size),
                       shift * value_size);
        } else {
                BUG_ON(shift > le32_to_cpu(right->header.max_entries));
                memcpy(key_ptr(right, 0),
                       key_ptr(left, nr_left - shift),
                       shift * sizeof(__le64));
                memcpy(value_ptr(right, 0, value_size),
                       value_ptr(left, nr_left - shift, value_size),
                       shift * value_size);
        }
}

/*
 * Delete a specific entry from a leaf node.
 */
static void delete_at(struct node *n, unsigned index)
{
        unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
        unsigned nr_to_copy = nr_entries - (index + 1);
        uint32_t value_size = le32_to_cpu(n->header.value_size);
        BUG_ON(index >= nr_entries);

        if (nr_to_copy) {
                memmove(key_ptr(n, index),
                        key_ptr(n, index + 1),
                        nr_to_copy * sizeof(__le64));

                memmove(value_ptr(n, index, value_size),
                        value_ptr(n, index + 1, value_size),
                        nr_to_copy * value_size);
        }

        n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}

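/*
 * Thresholds derived from the node geometry, e.g. with max_entries = 126:
 * del_threshold() = 42, so a child is only considered for rebalancing
 * once it has dropped to 42 entries or fewer; merge_threshold() = 85, so
 * two neighbours are merged when they hold 85 entries or fewer between
 * them (the merged node still fits, and one entry is about to be deleted
 * anyway).
 */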
static unsigned del_threshold(struct node *n)
{
        return le32_to_cpu(n->header.max_entries) / 3;
}

static unsigned merge_threshold(struct node *n)
{
        /*
         * The extra one is because we know we're potentially going to
         * delete an entry.
         */
        return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
}

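/*
 * A child node being juggled during rebalancing: 'index' is its position
 * within the parent, 'block' the shadowed block holding it, and 'n' the
 * node data within that block.
 */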
struct child {
        unsigned index;
        struct dm_block *block;
        struct node *n;
};

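/*
 * Value type for the internal levels of the btree, whose values are
 * __le64 block addresses of child nodes.  No inc/dec callbacks are
 * supplied; reference counts are adjusted explicitly, via inc_children()
 * when a block is shadowed and dm_tm_dec() when one is dropped.
 */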
static struct dm_btree_value_type le64_type = {
        .context = NULL,
        .size = sizeof(__le64),
        .inc = NULL,
        .dec = NULL,
        .equal = NULL
};

static int init_child(struct dm_btree_info *info, struct node *parent,
                      unsigned index, struct child *result)
{
        int r, inc;
        dm_block_t root;

        result->index = index;
        root = value64(parent, index);

        r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
                               &result->block, &inc);
        if (r)
                return r;

        result->n = dm_block_data(result->block);

        if (inc)
                inc_children(info->tm, result->n, &le64_type);

        *((__le64 *) value_ptr(parent, index, sizeof(__le64))) =
                cpu_to_le64(dm_block_location(result->block));

        return 0;
}

static int exit_child(struct dm_btree_info *info, struct child *c)
{
        return dm_tm_unlock(info->tm, c->block);
}

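/*
 * Redistribute entries between two neighbouring nodes.  A positive count
 * moves that many entries from the tail of @left to the front of @right;
 * a negative count moves them the other way.  E.g. with nr_left = 100 and
 * nr_right = 40, a count of 30 leaves both nodes holding 70 entries.
 * Unlike node_shift()/node_copy(), this helper does update nr_entries on
 * both nodes.
 */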
static void shift(struct node *left, struct node *right, int count)
{
        if (!count)
                return;

        if (count > 0) {
                node_shift(right, count);
                node_copy(left, right, count);
        } else {
                node_copy(left, right, count);
                node_shift(right, count);
        }

        left->header.nr_entries =
                cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
        BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));

        right->header.nr_entries =
                cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
        BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
}

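/*
 * Rebalance between two children.  If their combined entries are at or
 * below merge_threshold(), the right node is folded into the left and
 * removed from the parent.  Otherwise entries are shifted so the two
 * nodes end up roughly equal, e.g. 42 + 90 entries become 66 + 66, and
 * the parent's separator key is updated to the right node's new first
 * key.
 */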
static void __rebalance2(struct dm_btree_info *info, struct node *parent,
                         struct child *l, struct child *r)
{
        struct node *left = l->n;
        struct node *right = r->n;
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);

        if (nr_left + nr_right <= merge_threshold(left)) {
                /*
                 * Merge
                 */
                node_copy(left, right, -nr_right);
                left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
                delete_at(parent, r->index);

                /*
                 * We need to decrement the right block, but not its
                 * children, since they're still referenced by left.
                 */
                dm_tm_dec(info->tm, dm_block_location(r->block));
        } else {
                /*
                 * Rebalance.
                 */
                unsigned target_left = (nr_left + nr_right) / 2;
                unsigned shift_ = nr_left - target_left;
                BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
                BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
                shift(left, right, nr_left - target_left);
                *key_ptr(parent, r->index) = right->keys[0];
        }
}

static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
                      unsigned left_index)
{
        int r;
        struct node *parent;
        struct child left, right;

        parent = dm_block_data(shadow_current(s));

        r = init_child(info, parent, left_index, &left);
        if (r)
                return r;

        r = init_child(info, parent, left_index + 1, &right);
        if (r) {
                exit_child(info, &left);
                return r;
        }

        __rebalance2(info, parent, &left, &right);

        r = exit_child(info, &left);
        if (r) {
                exit_child(info, &right);
                return r;
        }

        return exit_child(info, &right);
}

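/*
 * Rebalance between three children.  If roughly half their combined
 * entries fall below merge_threshold(), the whole lot fits in two nodes:
 * the center's entries are dumped into left (any overflow into right),
 * the center is deleted from the parent and the remaining pair handed to
 * __rebalance2().  Otherwise entries are shared three ways, e.g. (with
 * max_entries = 126) 42 + 42 + 126 entries become 70 each, and the
 * parent's separator keys are refreshed.
 */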
static void __rebalance3(struct dm_btree_info *info, struct node *parent,
                         struct child *l, struct child *c, struct child *r)
{
        struct node *left = l->n;
        struct node *center = c->n;
        struct node *right = r->n;

        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
        uint32_t max_entries = le32_to_cpu(left->header.max_entries);

        unsigned target;

        BUG_ON(left->header.max_entries != center->header.max_entries);
        BUG_ON(center->header.max_entries != right->header.max_entries);

        if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
                /*
                 * Delete center node:
                 *
                 * We dump as many entries from center as possible into
                 * left, then the rest into right, then rebalance2.  This
                 * wastes some cpu, but I want something simple atm.
                 */
                unsigned shift = min(max_entries - nr_left, nr_center);

                BUG_ON(nr_left + shift > max_entries);
                node_copy(left, center, -shift);
                left->header.nr_entries = cpu_to_le32(nr_left + shift);

                if (shift != nr_center) {
                        shift = nr_center - shift;
                        BUG_ON((nr_right + shift) >= max_entries);
                        node_shift(right, shift);
                        node_copy(center, right, shift);
                        right->header.nr_entries = cpu_to_le32(nr_right + shift);
                }
                *key_ptr(parent, r->index) = right->keys[0];

                delete_at(parent, c->index);
                r->index--;

                dm_tm_dec(info->tm, dm_block_location(c->block));
                __rebalance2(info, parent, l, r);

                return;
        }

        /*
         * Rebalance
         */
        target = (nr_left + nr_center + nr_right) / 3;
        BUG_ON(target > max_entries);

        /*
         * Adjust the left node
         */
        shift(left, center, nr_left - target);

        /*
         * Adjust the right node
         */
        shift(center, right, target - nr_right);
        *key_ptr(parent, c->index) = center->keys[0];
        *key_ptr(parent, r->index) = right->keys[0];
}

static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
                      unsigned left_index)
{
        int r;
        struct node *parent = dm_block_data(shadow_current(s));
        struct child left, center, right;

        /*
         * FIXME: fill out an array?
         */
        r = init_child(info, parent, left_index, &left);
        if (r)
                return r;

        r = init_child(info, parent, left_index + 1, &center);
        if (r) {
                exit_child(info, &left);
                return r;
        }

        r = init_child(info, parent, left_index + 2, &right);
        if (r) {
                exit_child(info, &left);
                exit_child(info, &center);
                return r;
        }

        __rebalance3(info, parent, &left, &center, &right);

        r = exit_child(info, &left);
        if (r) {
                exit_child(info, &center);
                exit_child(info, &right);
                return r;
        }

        r = exit_child(info, &center);
        if (r) {
                exit_child(info, &right);
                return r;
        }

        r = exit_child(info, &right);
        if (r)
                return r;

        return 0;
}

static int get_nr_entries(struct dm_transaction_manager *tm,
                          dm_block_t b, uint32_t *result)
{
        int r;
        struct dm_block *block;
        struct node *n;

        r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
        if (r)
                return r;

        n = dm_block_data(block);
        *result = le32_to_cpu(n->header.nr_entries);

        return dm_tm_unlock(tm, block);
}

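/*
 * Implements the scheme described in the comment at the top of the file.
 * A one-entry node is case [A]: the single child is copied over its
 * parent, shrinking the depth of the tree.  Otherwise, if the child we
 * are about to descend into is at or below del_threshold(), a missing
 * left or right sibling gives cases [B]/[C] (rebalance2), and having
 * both siblings gives cases [D]/[E] (rebalance3).
 */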
static int rebalance_children(struct shadow_spine *s,
                              struct dm_btree_info *info, uint64_t key)
{
        int i, r, has_left_sibling, has_right_sibling;
        uint32_t child_entries;
        struct node *n;

        n = dm_block_data(shadow_current(s));

        if (le32_to_cpu(n->header.nr_entries) == 1) {
                struct dm_block *child;
                dm_block_t b = value64(n, 0);

                r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
                if (r)
                        return r;

                memcpy(n, dm_block_data(child),
                       dm_bm_block_size(dm_tm_get_bm(info->tm)));
                r = dm_tm_unlock(info->tm, child);
                if (r)
                        return r;

                dm_tm_dec(info->tm, dm_block_location(child));
                return 0;
        }

        i = lower_bound(n, key);
        if (i < 0)
                return -ENODATA;

        r = get_nr_entries(info->tm, value64(n, i), &child_entries);
        if (r)
                return r;

        if (child_entries > del_threshold(n))
                return 0;

        has_left_sibling = i > 0;
        has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);

        if (!has_left_sibling)
                r = rebalance2(s, info, i);

        else if (!has_right_sibling)
                r = rebalance2(s, info, i - 1);

        else
                r = rebalance3(s, info, i - 1);

        return r;
}

static int do_leaf(struct node *n, uint64_t key, unsigned *index)
{
        int i = lower_bound(n, key);

        if ((i < 0) ||
            (i >= le32_to_cpu(n->header.nr_entries)) ||
            (le64_to_cpu(n->keys[i]) != key))
                return -ENODATA;

        *index = i;

        return 0;
}

/*
 * Prepares for removal from one level of the hierarchy.  The caller must
 * call delete_at() to remove the entry at index.
 */
static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
                      struct dm_btree_value_type *vt, dm_block_t root,
                      uint64_t key, unsigned *index)
{
        int i = *index, r;
        struct node *n;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        break;

                /*
                 * We have to patch up the parent node, ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s)) {
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
                        memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
                               &location, sizeof(__le64));
                }

                n = dm_block_data(shadow_current(s));

                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
                        return do_leaf(n, key, index);

                r = rebalance_children(s, info, key);
                if (r)
                        break;

                n = dm_block_data(shadow_current(s));
                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
                        return do_leaf(n, key, index);

                i = lower_bound(n, key);

                /*
                 * We know the key is present, or else
                 * rebalance_children would have returned
                 * -ENODATA
                 */
                root = value64(n, i);
        }

        return r;
}

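/*
 * A multi-level btree (info->levels > 1) is a tree of trees: each of the
 * first levels stores __le64 roots of the next level's tree, so those
 * levels are walked with le64_type, and only the final level uses the
 * caller's value_type.  keys[] supplies one key per level.
 */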
int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, dm_block_t *new_root)
{
        unsigned level, last_level = info->levels - 1;
        int index = 0, r = 0;
        struct shadow_spine spine;
        struct node *n;

        init_shadow_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = remove_raw(&spine, info,
                               (level == last_level ?
                                &info->value_type : &le64_type),
                               root, keys[level], (unsigned *)&index);
                if (r < 0)
                        break;

                n = dm_block_data(shadow_current(&spine));
                if (level != last_level) {
                        root = value64(n, index);
                        continue;
                }

                BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));

                if (info->value_type.dec)
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index, info->value_type.size));

                delete_at(n, index);
        }

        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);
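
/*
 * Example usage (an illustrative sketch, not part of this patch): a
 * caller with a single-level tree set up in 'info' might remove one
 * mapping like this, where 'root' is hypothetical and would come from
 * the caller's superblock:
 *
 *	uint64_t key = 1234;
 *	dm_block_t new_root;
 *	int r = dm_btree_remove(info, root, &key, &new_root);
 *
 * On success, new_root is the root of the shadowed tree with the key
 * removed and must be recorded by the caller; -ENODATA means the key
 * was not present.
 */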