diff options
Diffstat (limited to 'lib/radix-tree.c')
-rw-r--r-- | lib/radix-tree.c | 153 |
1 files changed, 111 insertions, 42 deletions
diff --git a/lib/radix-tree.c b/lib/radix-tree.c index e907858498a..5086bb962b4 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c | |||
@@ -49,7 +49,7 @@ struct radix_tree_node { | |||
49 | unsigned int height; /* Height from the bottom */ | 49 | unsigned int height; /* Height from the bottom */ |
50 | unsigned int count; | 50 | unsigned int count; |
51 | struct rcu_head rcu_head; | 51 | struct rcu_head rcu_head; |
52 | void *slots[RADIX_TREE_MAP_SIZE]; | 52 | void __rcu *slots[RADIX_TREE_MAP_SIZE]; |
53 | unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; | 53 | unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; |
54 | }; | 54 | }; |
55 | 55 | ||
@@ -82,6 +82,16 @@ struct radix_tree_preload { | |||
82 | }; | 82 | }; |
83 | static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; | 83 | static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; |
84 | 84 | ||
85 | static inline void *ptr_to_indirect(void *ptr) | ||
86 | { | ||
87 | return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); | ||
88 | } | ||
89 | |||
90 | static inline void *indirect_to_ptr(void *ptr) | ||
91 | { | ||
92 | return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); | ||
93 | } | ||
94 | |||
85 | static inline gfp_t root_gfp_mask(struct radix_tree_root *root) | 95 | static inline gfp_t root_gfp_mask(struct radix_tree_root *root) |
86 | { | 96 | { |
87 | return root->gfp_mask & __GFP_BITS_MASK; | 97 | return root->gfp_mask & __GFP_BITS_MASK; |
@@ -174,14 +184,16 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) | |||
174 | { | 184 | { |
175 | struct radix_tree_node *node = | 185 | struct radix_tree_node *node = |
176 | container_of(head, struct radix_tree_node, rcu_head); | 186 | container_of(head, struct radix_tree_node, rcu_head); |
187 | int i; | ||
177 | 188 | ||
178 | /* | 189 | /* |
179 | * must only free zeroed nodes into the slab. radix_tree_shrink | 190 | * must only free zeroed nodes into the slab. radix_tree_shrink |
180 | * can leave us with a non-NULL entry in the first slot, so clear | 191 | * can leave us with a non-NULL entry in the first slot, so clear |
181 | * that here to make sure. | 192 | * that here to make sure. |
182 | */ | 193 | */ |
183 | tag_clear(node, 0, 0); | 194 | for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) |
184 | tag_clear(node, 1, 0); | 195 | tag_clear(node, i, 0); |
196 | |||
185 | node->slots[0] = NULL; | 197 | node->slots[0] = NULL; |
186 | node->count = 0; | 198 | node->count = 0; |
187 | 199 | ||
@@ -263,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) | |||
263 | return -ENOMEM; | 275 | return -ENOMEM; |
264 | 276 | ||
265 | /* Increase the height. */ | 277 | /* Increase the height. */ |
266 | node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); | 278 | node->slots[0] = indirect_to_ptr(root->rnode); |
267 | 279 | ||
268 | /* Propagate the aggregated tag info into the new root */ | 280 | /* Propagate the aggregated tag info into the new root */ |
269 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { | 281 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { |
@@ -274,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) | |||
274 | newheight = root->height+1; | 286 | newheight = root->height+1; |
275 | node->height = newheight; | 287 | node->height = newheight; |
276 | node->count = 1; | 288 | node->count = 1; |
277 | node = radix_tree_ptr_to_indirect(node); | 289 | node = ptr_to_indirect(node); |
278 | rcu_assign_pointer(root->rnode, node); | 290 | rcu_assign_pointer(root->rnode, node); |
279 | root->height = newheight; | 291 | root->height = newheight; |
280 | } while (height > root->height); | 292 | } while (height > root->height); |
@@ -307,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root, | |||
307 | return error; | 319 | return error; |
308 | } | 320 | } |
309 | 321 | ||
310 | slot = radix_tree_indirect_to_ptr(root->rnode); | 322 | slot = indirect_to_ptr(root->rnode); |
311 | 323 | ||
312 | height = root->height; | 324 | height = root->height; |
313 | shift = (height-1) * RADIX_TREE_MAP_SHIFT; | 325 | shift = (height-1) * RADIX_TREE_MAP_SHIFT; |
@@ -323,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root, | |||
323 | rcu_assign_pointer(node->slots[offset], slot); | 335 | rcu_assign_pointer(node->slots[offset], slot); |
324 | node->count++; | 336 | node->count++; |
325 | } else | 337 | } else |
326 | rcu_assign_pointer(root->rnode, | 338 | rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); |
327 | radix_tree_ptr_to_indirect(slot)); | ||
328 | } | 339 | } |
329 | 340 | ||
330 | /* Go a level down */ | 341 | /* Go a level down */ |
@@ -372,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, | |||
372 | return NULL; | 383 | return NULL; |
373 | return is_slot ? (void *)&root->rnode : node; | 384 | return is_slot ? (void *)&root->rnode : node; |
374 | } | 385 | } |
375 | node = radix_tree_indirect_to_ptr(node); | 386 | node = indirect_to_ptr(node); |
376 | 387 | ||
377 | height = node->height; | 388 | height = node->height; |
378 | if (index > radix_tree_maxindex(height)) | 389 | if (index > radix_tree_maxindex(height)) |
@@ -391,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, | |||
391 | height--; | 402 | height--; |
392 | } while (height > 0); | 403 | } while (height > 0); |
393 | 404 | ||
394 | return is_slot ? (void *)slot:node; | 405 | return is_slot ? (void *)slot : indirect_to_ptr(node); |
395 | } | 406 | } |
396 | 407 | ||
397 | /** | 408 | /** |
@@ -453,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, | |||
453 | height = root->height; | 464 | height = root->height; |
454 | BUG_ON(index > radix_tree_maxindex(height)); | 465 | BUG_ON(index > radix_tree_maxindex(height)); |
455 | 466 | ||
456 | slot = radix_tree_indirect_to_ptr(root->rnode); | 467 | slot = indirect_to_ptr(root->rnode); |
457 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; | 468 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; |
458 | 469 | ||
459 | while (height > 0) { | 470 | while (height > 0) { |
@@ -507,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, | |||
507 | 518 | ||
508 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; | 519 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; |
509 | pathp->node = NULL; | 520 | pathp->node = NULL; |
510 | slot = radix_tree_indirect_to_ptr(root->rnode); | 521 | slot = indirect_to_ptr(root->rnode); |
511 | 522 | ||
512 | while (height > 0) { | 523 | while (height > 0) { |
513 | int offset; | 524 | int offset; |
@@ -577,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, | |||
577 | 588 | ||
578 | if (!radix_tree_is_indirect_ptr(node)) | 589 | if (!radix_tree_is_indirect_ptr(node)) |
579 | return (index == 0); | 590 | return (index == 0); |
580 | node = radix_tree_indirect_to_ptr(node); | 591 | node = indirect_to_ptr(node); |
581 | 592 | ||
582 | height = node->height; | 593 | height = node->height; |
583 | if (index > radix_tree_maxindex(height)) | 594 | if (index > radix_tree_maxindex(height)) |
@@ -623,17 +634,30 @@ EXPORT_SYMBOL(radix_tree_tag_get); | |||
623 | * also settag. The function stops either after tagging nr_to_tag items or | 634 | * also settag. The function stops either after tagging nr_to_tag items or |
624 | * after reaching last_index. | 635 | * after reaching last_index. |
625 | * | 636 | * |
637 | * The tags must be set from the leaf level only and propagated back up the | ||
638 | * path to the root. We must do this so that we resolve the full path before | ||
639 | * setting any tags on intermediate nodes. If we set tags as we descend, then | ||
640 | * we can get to the leaf node and find that the index that has the iftag | ||
641 | * set is outside the range we are scanning. This reults in dangling tags and | ||
642 | * can lead to problems with later tag operations (e.g. livelocks on lookups). | ||
643 | * | ||
626 | * The function returns number of leaves where the tag was set and sets | 644 | * The function returns number of leaves where the tag was set and sets |
627 | * *first_indexp to the first unscanned index. | 645 | * *first_indexp to the first unscanned index. |
646 | * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must | ||
647 | * be prepared to handle that. | ||
628 | */ | 648 | */ |
629 | unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | 649 | unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, |
630 | unsigned long *first_indexp, unsigned long last_index, | 650 | unsigned long *first_indexp, unsigned long last_index, |
631 | unsigned long nr_to_tag, | 651 | unsigned long nr_to_tag, |
632 | unsigned int iftag, unsigned int settag) | 652 | unsigned int iftag, unsigned int settag) |
633 | { | 653 | { |
634 | unsigned int height = root->height, shift; | 654 | unsigned int height = root->height; |
635 | unsigned long tagged = 0, index = *first_indexp; | 655 | struct radix_tree_path path[height]; |
636 | struct radix_tree_node *open_slots[height], *slot; | 656 | struct radix_tree_path *pathp = path; |
657 | struct radix_tree_node *slot; | ||
658 | unsigned int shift; | ||
659 | unsigned long tagged = 0; | ||
660 | unsigned long index = *first_indexp; | ||
637 | 661 | ||
638 | last_index = min(last_index, radix_tree_maxindex(height)); | 662 | last_index = min(last_index, radix_tree_maxindex(height)); |
639 | if (index > last_index) | 663 | if (index > last_index) |
@@ -651,7 +675,14 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | |||
651 | } | 675 | } |
652 | 676 | ||
653 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; | 677 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; |
654 | slot = radix_tree_indirect_to_ptr(root->rnode); | 678 | slot = indirect_to_ptr(root->rnode); |
679 | |||
680 | /* | ||
681 | * we fill the path from (root->height - 2) to 0, leaving the index at | ||
682 | * (root->height - 1) as a terminator. Zero the node in the terminator | ||
683 | * so that we can use this to end walk loops back up the path. | ||
684 | */ | ||
685 | path[height - 1].node = NULL; | ||
655 | 686 | ||
656 | for (;;) { | 687 | for (;;) { |
657 | int offset; | 688 | int offset; |
@@ -661,21 +692,35 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | |||
661 | goto next; | 692 | goto next; |
662 | if (!tag_get(slot, iftag, offset)) | 693 | if (!tag_get(slot, iftag, offset)) |
663 | goto next; | 694 | goto next; |
695 | if (height > 1) { | ||
696 | /* Go down one level */ | ||
697 | height--; | ||
698 | shift -= RADIX_TREE_MAP_SHIFT; | ||
699 | path[height - 1].node = slot; | ||
700 | path[height - 1].offset = offset; | ||
701 | slot = slot->slots[offset]; | ||
702 | continue; | ||
703 | } | ||
704 | |||
705 | /* tag the leaf */ | ||
706 | tagged++; | ||
664 | tag_set(slot, settag, offset); | 707 | tag_set(slot, settag, offset); |
665 | if (height == 1) { | 708 | |
666 | tagged++; | 709 | /* walk back up the path tagging interior nodes */ |
667 | goto next; | 710 | pathp = &path[0]; |
711 | while (pathp->node) { | ||
712 | /* stop if we find a node with the tag already set */ | ||
713 | if (tag_get(pathp->node, settag, pathp->offset)) | ||
714 | break; | ||
715 | tag_set(pathp->node, settag, pathp->offset); | ||
716 | pathp++; | ||
668 | } | 717 | } |
669 | /* Go down one level */ | 718 | |
670 | height--; | ||
671 | shift -= RADIX_TREE_MAP_SHIFT; | ||
672 | open_slots[height] = slot; | ||
673 | slot = slot->slots[offset]; | ||
674 | continue; | ||
675 | next: | 719 | next: |
676 | /* Go to next item at level determined by 'shift' */ | 720 | /* Go to next item at level determined by 'shift' */ |
677 | index = ((index >> shift) + 1) << shift; | 721 | index = ((index >> shift) + 1) << shift; |
678 | if (index > last_index) | 722 | /* Overflow can happen when last_index is ~0UL... */ |
723 | if (index > last_index || !index) | ||
679 | break; | 724 | break; |
680 | if (tagged >= nr_to_tag) | 725 | if (tagged >= nr_to_tag) |
681 | break; | 726 | break; |
@@ -685,7 +730,7 @@ next: | |||
685 | * last_index is guaranteed to be in the tree, what | 730 | * last_index is guaranteed to be in the tree, what |
686 | * we do below cannot wander astray. | 731 | * we do below cannot wander astray. |
687 | */ | 732 | */ |
688 | slot = open_slots[height]; | 733 | slot = path[height - 1].node; |
689 | height++; | 734 | height++; |
690 | shift += RADIX_TREE_MAP_SHIFT; | 735 | shift += RADIX_TREE_MAP_SHIFT; |
691 | } | 736 | } |
@@ -861,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, | |||
861 | results[0] = node; | 906 | results[0] = node; |
862 | return 1; | 907 | return 1; |
863 | } | 908 | } |
864 | node = radix_tree_indirect_to_ptr(node); | 909 | node = indirect_to_ptr(node); |
865 | 910 | ||
866 | max_index = radix_tree_maxindex(node->height); | 911 | max_index = radix_tree_maxindex(node->height); |
867 | 912 | ||
@@ -880,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, | |||
880 | slot = *(((void ***)results)[ret + i]); | 925 | slot = *(((void ***)results)[ret + i]); |
881 | if (!slot) | 926 | if (!slot) |
882 | continue; | 927 | continue; |
883 | results[ret + nr_found] = rcu_dereference_raw(slot); | 928 | results[ret + nr_found] = |
929 | indirect_to_ptr(rcu_dereference_raw(slot)); | ||
884 | nr_found++; | 930 | nr_found++; |
885 | } | 931 | } |
886 | ret += nr_found; | 932 | ret += nr_found; |
@@ -929,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, | |||
929 | results[0] = (void **)&root->rnode; | 975 | results[0] = (void **)&root->rnode; |
930 | return 1; | 976 | return 1; |
931 | } | 977 | } |
932 | node = radix_tree_indirect_to_ptr(node); | 978 | node = indirect_to_ptr(node); |
933 | 979 | ||
934 | max_index = radix_tree_maxindex(node->height); | 980 | max_index = radix_tree_maxindex(node->height); |
935 | 981 | ||
@@ -1054,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, | |||
1054 | results[0] = node; | 1100 | results[0] = node; |
1055 | return 1; | 1101 | return 1; |
1056 | } | 1102 | } |
1057 | node = radix_tree_indirect_to_ptr(node); | 1103 | node = indirect_to_ptr(node); |
1058 | 1104 | ||
1059 | max_index = radix_tree_maxindex(node->height); | 1105 | max_index = radix_tree_maxindex(node->height); |
1060 | 1106 | ||
@@ -1073,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, | |||
1073 | slot = *(((void ***)results)[ret + i]); | 1119 | slot = *(((void ***)results)[ret + i]); |
1074 | if (!slot) | 1120 | if (!slot) |
1075 | continue; | 1121 | continue; |
1076 | results[ret + nr_found] = rcu_dereference_raw(slot); | 1122 | results[ret + nr_found] = |
1123 | indirect_to_ptr(rcu_dereference_raw(slot)); | ||
1077 | nr_found++; | 1124 | nr_found++; |
1078 | } | 1125 | } |
1079 | ret += nr_found; | 1126 | ret += nr_found; |
@@ -1123,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, | |||
1123 | results[0] = (void **)&root->rnode; | 1170 | results[0] = (void **)&root->rnode; |
1124 | return 1; | 1171 | return 1; |
1125 | } | 1172 | } |
1126 | node = radix_tree_indirect_to_ptr(node); | 1173 | node = indirect_to_ptr(node); |
1127 | 1174 | ||
1128 | max_index = radix_tree_maxindex(node->height); | 1175 | max_index = radix_tree_maxindex(node->height); |
1129 | 1176 | ||
@@ -1159,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) | |||
1159 | void *newptr; | 1206 | void *newptr; |
1160 | 1207 | ||
1161 | BUG_ON(!radix_tree_is_indirect_ptr(to_free)); | 1208 | BUG_ON(!radix_tree_is_indirect_ptr(to_free)); |
1162 | to_free = radix_tree_indirect_to_ptr(to_free); | 1209 | to_free = indirect_to_ptr(to_free); |
1163 | 1210 | ||
1164 | /* | 1211 | /* |
1165 | * The candidate node has more than one child, or its child | 1212 | * The candidate node has more than one child, or its child |
@@ -1172,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) | |||
1172 | 1219 | ||
1173 | /* | 1220 | /* |
1174 | * We don't need rcu_assign_pointer(), since we are simply | 1221 | * We don't need rcu_assign_pointer(), since we are simply |
1175 | * moving the node from one part of the tree to another. If | 1222 | * moving the node from one part of the tree to another: if it |
1176 | * it was safe to dereference the old pointer to it | 1223 | * was safe to dereference the old pointer to it |
1177 | * (to_free->slots[0]), it will be safe to dereference the new | 1224 | * (to_free->slots[0]), it will be safe to dereference the new |
1178 | * one (root->rnode). | 1225 | * one (root->rnode) as far as dependent read barriers go. |
1179 | */ | 1226 | */ |
1180 | newptr = to_free->slots[0]; | 1227 | newptr = to_free->slots[0]; |
1181 | if (root->height > 1) | 1228 | if (root->height > 1) |
1182 | newptr = radix_tree_ptr_to_indirect(newptr); | 1229 | newptr = ptr_to_indirect(newptr); |
1183 | root->rnode = newptr; | 1230 | root->rnode = newptr; |
1184 | root->height--; | 1231 | root->height--; |
1232 | |||
1233 | /* | ||
1234 | * We have a dilemma here. The node's slot[0] must not be | ||
1235 | * NULLed in case there are concurrent lookups expecting to | ||
1236 | * find the item. However if this was a bottom-level node, | ||
1237 | * then it may be subject to the slot pointer being visible | ||
1238 | * to callers dereferencing it. If item corresponding to | ||
1239 | * slot[0] is subsequently deleted, these callers would expect | ||
1240 | * their slot to become empty sooner or later. | ||
1241 | * | ||
1242 | * For example, lockless pagecache will look up a slot, deref | ||
1243 | * the page pointer, and if the page is 0 refcount it means it | ||
1244 | * was concurrently deleted from pagecache so try the deref | ||
1245 | * again. Fortunately there is already a requirement for logic | ||
1246 | * to retry the entire slot lookup -- the indirect pointer | ||
1247 | * problem (replacing direct root node with an indirect pointer | ||
1248 | * also results in a stale slot). So tag the slot as indirect | ||
1249 | * to force callers to retry. | ||
1250 | */ | ||
1251 | if (root->height == 0) | ||
1252 | *((unsigned long *)&to_free->slots[0]) |= | ||
1253 | RADIX_TREE_INDIRECT_PTR; | ||
1254 | |||
1185 | radix_tree_node_free(to_free); | 1255 | radix_tree_node_free(to_free); |
1186 | } | 1256 | } |
1187 | } | 1257 | } |
@@ -1218,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | |||
1218 | root->rnode = NULL; | 1288 | root->rnode = NULL; |
1219 | goto out; | 1289 | goto out; |
1220 | } | 1290 | } |
1221 | slot = radix_tree_indirect_to_ptr(slot); | 1291 | slot = indirect_to_ptr(slot); |
1222 | 1292 | ||
1223 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; | 1293 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; |
1224 | pathp->node = NULL; | 1294 | pathp->node = NULL; |
@@ -1260,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | |||
1260 | radix_tree_node_free(to_free); | 1330 | radix_tree_node_free(to_free); |
1261 | 1331 | ||
1262 | if (pathp->node->count) { | 1332 | if (pathp->node->count) { |
1263 | if (pathp->node == | 1333 | if (pathp->node == indirect_to_ptr(root->rnode)) |
1264 | radix_tree_indirect_to_ptr(root->rnode)) | ||
1265 | radix_tree_shrink(root); | 1334 | radix_tree_shrink(root); |
1266 | goto out; | 1335 | goto out; |
1267 | } | 1336 | } |