aboutsummaryrefslogtreecommitdiffstats
path: root/lib/radix-tree.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/radix-tree.c')
-rw-r--r--lib/radix-tree.c153
1 files changed, 111 insertions, 42 deletions
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index e907858498a..5086bb962b4 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -49,7 +49,7 @@ struct radix_tree_node {
49 unsigned int height; /* Height from the bottom */ 49 unsigned int height; /* Height from the bottom */
50 unsigned int count; 50 unsigned int count;
51 struct rcu_head rcu_head; 51 struct rcu_head rcu_head;
52 void *slots[RADIX_TREE_MAP_SIZE]; 52 void __rcu *slots[RADIX_TREE_MAP_SIZE];
53 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; 53 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
54}; 54};
55 55
@@ -82,6 +82,16 @@ struct radix_tree_preload {
82}; 82};
83static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; 83static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
84 84
85static inline void *ptr_to_indirect(void *ptr)
86{
87 return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
88}
89
90static inline void *indirect_to_ptr(void *ptr)
91{
92 return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
93}
94
85static inline gfp_t root_gfp_mask(struct radix_tree_root *root) 95static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
86{ 96{
87 return root->gfp_mask & __GFP_BITS_MASK; 97 return root->gfp_mask & __GFP_BITS_MASK;
@@ -174,14 +184,16 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
174{ 184{
175 struct radix_tree_node *node = 185 struct radix_tree_node *node =
176 container_of(head, struct radix_tree_node, rcu_head); 186 container_of(head, struct radix_tree_node, rcu_head);
187 int i;
177 188
178 /* 189 /*
179 * must only free zeroed nodes into the slab. radix_tree_shrink 190 * must only free zeroed nodes into the slab. radix_tree_shrink
180 * can leave us with a non-NULL entry in the first slot, so clear 191 * can leave us with a non-NULL entry in the first slot, so clear
181 * that here to make sure. 192 * that here to make sure.
182 */ 193 */
183 tag_clear(node, 0, 0); 194 for (i = 0; i < RADIX_TREE_MAX_TAGS; i++)
184 tag_clear(node, 1, 0); 195 tag_clear(node, i, 0);
196
185 node->slots[0] = NULL; 197 node->slots[0] = NULL;
186 node->count = 0; 198 node->count = 0;
187 199
@@ -263,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
263 return -ENOMEM; 275 return -ENOMEM;
264 276
265 /* Increase the height. */ 277 /* Increase the height. */
266 node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); 278 node->slots[0] = indirect_to_ptr(root->rnode);
267 279
268 /* Propagate the aggregated tag info into the new root */ 280 /* Propagate the aggregated tag info into the new root */
269 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { 281 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
@@ -274,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
274 newheight = root->height+1; 286 newheight = root->height+1;
275 node->height = newheight; 287 node->height = newheight;
276 node->count = 1; 288 node->count = 1;
277 node = radix_tree_ptr_to_indirect(node); 289 node = ptr_to_indirect(node);
278 rcu_assign_pointer(root->rnode, node); 290 rcu_assign_pointer(root->rnode, node);
279 root->height = newheight; 291 root->height = newheight;
280 } while (height > root->height); 292 } while (height > root->height);
@@ -307,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root,
307 return error; 319 return error;
308 } 320 }
309 321
310 slot = radix_tree_indirect_to_ptr(root->rnode); 322 slot = indirect_to_ptr(root->rnode);
311 323
312 height = root->height; 324 height = root->height;
313 shift = (height-1) * RADIX_TREE_MAP_SHIFT; 325 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
@@ -323,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root,
323 rcu_assign_pointer(node->slots[offset], slot); 335 rcu_assign_pointer(node->slots[offset], slot);
324 node->count++; 336 node->count++;
325 } else 337 } else
326 rcu_assign_pointer(root->rnode, 338 rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
327 radix_tree_ptr_to_indirect(slot));
328 } 339 }
329 340
330 /* Go a level down */ 341 /* Go a level down */
@@ -372,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
372 return NULL; 383 return NULL;
373 return is_slot ? (void *)&root->rnode : node; 384 return is_slot ? (void *)&root->rnode : node;
374 } 385 }
375 node = radix_tree_indirect_to_ptr(node); 386 node = indirect_to_ptr(node);
376 387
377 height = node->height; 388 height = node->height;
378 if (index > radix_tree_maxindex(height)) 389 if (index > radix_tree_maxindex(height))
@@ -391,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
391 height--; 402 height--;
392 } while (height > 0); 403 } while (height > 0);
393 404
394 return is_slot ? (void *)slot:node; 405 return is_slot ? (void *)slot : indirect_to_ptr(node);
395} 406}
396 407
397/** 408/**
@@ -453,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
453 height = root->height; 464 height = root->height;
454 BUG_ON(index > radix_tree_maxindex(height)); 465 BUG_ON(index > radix_tree_maxindex(height));
455 466
456 slot = radix_tree_indirect_to_ptr(root->rnode); 467 slot = indirect_to_ptr(root->rnode);
457 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 468 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
458 469
459 while (height > 0) { 470 while (height > 0) {
@@ -507,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
507 518
508 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 519 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
509 pathp->node = NULL; 520 pathp->node = NULL;
510 slot = radix_tree_indirect_to_ptr(root->rnode); 521 slot = indirect_to_ptr(root->rnode);
511 522
512 while (height > 0) { 523 while (height > 0) {
513 int offset; 524 int offset;
@@ -577,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
577 588
578 if (!radix_tree_is_indirect_ptr(node)) 589 if (!radix_tree_is_indirect_ptr(node))
579 return (index == 0); 590 return (index == 0);
580 node = radix_tree_indirect_to_ptr(node); 591 node = indirect_to_ptr(node);
581 592
582 height = node->height; 593 height = node->height;
583 if (index > radix_tree_maxindex(height)) 594 if (index > radix_tree_maxindex(height))
@@ -623,17 +634,30 @@ EXPORT_SYMBOL(radix_tree_tag_get);
623 * also settag. The function stops either after tagging nr_to_tag items or 634 * also settag. The function stops either after tagging nr_to_tag items or
624 * after reaching last_index. 635 * after reaching last_index.
625 * 636 *
637 * The tags must be set from the leaf level only and propagated back up the
638 * path to the root. We must do this so that we resolve the full path before
639 * setting any tags on intermediate nodes. If we set tags as we descend, then
640 * we can get to the leaf node and find that the index that has the iftag
641 * set is outside the range we are scanning. This reults in dangling tags and
642 * can lead to problems with later tag operations (e.g. livelocks on lookups).
643 *
626 * The function returns number of leaves where the tag was set and sets 644 * The function returns number of leaves where the tag was set and sets
627 * *first_indexp to the first unscanned index. 645 * *first_indexp to the first unscanned index.
646 * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
647 * be prepared to handle that.
628 */ 648 */
629unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, 649unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
630 unsigned long *first_indexp, unsigned long last_index, 650 unsigned long *first_indexp, unsigned long last_index,
631 unsigned long nr_to_tag, 651 unsigned long nr_to_tag,
632 unsigned int iftag, unsigned int settag) 652 unsigned int iftag, unsigned int settag)
633{ 653{
634 unsigned int height = root->height, shift; 654 unsigned int height = root->height;
635 unsigned long tagged = 0, index = *first_indexp; 655 struct radix_tree_path path[height];
636 struct radix_tree_node *open_slots[height], *slot; 656 struct radix_tree_path *pathp = path;
657 struct radix_tree_node *slot;
658 unsigned int shift;
659 unsigned long tagged = 0;
660 unsigned long index = *first_indexp;
637 661
638 last_index = min(last_index, radix_tree_maxindex(height)); 662 last_index = min(last_index, radix_tree_maxindex(height));
639 if (index > last_index) 663 if (index > last_index)
@@ -651,7 +675,14 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
651 } 675 }
652 676
653 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 677 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
654 slot = radix_tree_indirect_to_ptr(root->rnode); 678 slot = indirect_to_ptr(root->rnode);
679
680 /*
681 * we fill the path from (root->height - 2) to 0, leaving the index at
682 * (root->height - 1) as a terminator. Zero the node in the terminator
683 * so that we can use this to end walk loops back up the path.
684 */
685 path[height - 1].node = NULL;
655 686
656 for (;;) { 687 for (;;) {
657 int offset; 688 int offset;
@@ -661,21 +692,35 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
661 goto next; 692 goto next;
662 if (!tag_get(slot, iftag, offset)) 693 if (!tag_get(slot, iftag, offset))
663 goto next; 694 goto next;
695 if (height > 1) {
696 /* Go down one level */
697 height--;
698 shift -= RADIX_TREE_MAP_SHIFT;
699 path[height - 1].node = slot;
700 path[height - 1].offset = offset;
701 slot = slot->slots[offset];
702 continue;
703 }
704
705 /* tag the leaf */
706 tagged++;
664 tag_set(slot, settag, offset); 707 tag_set(slot, settag, offset);
665 if (height == 1) { 708
666 tagged++; 709 /* walk back up the path tagging interior nodes */
667 goto next; 710 pathp = &path[0];
711 while (pathp->node) {
712 /* stop if we find a node with the tag already set */
713 if (tag_get(pathp->node, settag, pathp->offset))
714 break;
715 tag_set(pathp->node, settag, pathp->offset);
716 pathp++;
668 } 717 }
669 /* Go down one level */ 718
670 height--;
671 shift -= RADIX_TREE_MAP_SHIFT;
672 open_slots[height] = slot;
673 slot = slot->slots[offset];
674 continue;
675next: 719next:
676 /* Go to next item at level determined by 'shift' */ 720 /* Go to next item at level determined by 'shift' */
677 index = ((index >> shift) + 1) << shift; 721 index = ((index >> shift) + 1) << shift;
678 if (index > last_index) 722 /* Overflow can happen when last_index is ~0UL... */
723 if (index > last_index || !index)
679 break; 724 break;
680 if (tagged >= nr_to_tag) 725 if (tagged >= nr_to_tag)
681 break; 726 break;
@@ -685,7 +730,7 @@ next:
685 * last_index is guaranteed to be in the tree, what 730 * last_index is guaranteed to be in the tree, what
686 * we do below cannot wander astray. 731 * we do below cannot wander astray.
687 */ 732 */
688 slot = open_slots[height]; 733 slot = path[height - 1].node;
689 height++; 734 height++;
690 shift += RADIX_TREE_MAP_SHIFT; 735 shift += RADIX_TREE_MAP_SHIFT;
691 } 736 }
@@ -861,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
861 results[0] = node; 906 results[0] = node;
862 return 1; 907 return 1;
863 } 908 }
864 node = radix_tree_indirect_to_ptr(node); 909 node = indirect_to_ptr(node);
865 910
866 max_index = radix_tree_maxindex(node->height); 911 max_index = radix_tree_maxindex(node->height);
867 912
@@ -880,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
880 slot = *(((void ***)results)[ret + i]); 925 slot = *(((void ***)results)[ret + i]);
881 if (!slot) 926 if (!slot)
882 continue; 927 continue;
883 results[ret + nr_found] = rcu_dereference_raw(slot); 928 results[ret + nr_found] =
929 indirect_to_ptr(rcu_dereference_raw(slot));
884 nr_found++; 930 nr_found++;
885 } 931 }
886 ret += nr_found; 932 ret += nr_found;
@@ -929,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
929 results[0] = (void **)&root->rnode; 975 results[0] = (void **)&root->rnode;
930 return 1; 976 return 1;
931 } 977 }
932 node = radix_tree_indirect_to_ptr(node); 978 node = indirect_to_ptr(node);
933 979
934 max_index = radix_tree_maxindex(node->height); 980 max_index = radix_tree_maxindex(node->height);
935 981
@@ -1054,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
1054 results[0] = node; 1100 results[0] = node;
1055 return 1; 1101 return 1;
1056 } 1102 }
1057 node = radix_tree_indirect_to_ptr(node); 1103 node = indirect_to_ptr(node);
1058 1104
1059 max_index = radix_tree_maxindex(node->height); 1105 max_index = radix_tree_maxindex(node->height);
1060 1106
@@ -1073,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
1073 slot = *(((void ***)results)[ret + i]); 1119 slot = *(((void ***)results)[ret + i]);
1074 if (!slot) 1120 if (!slot)
1075 continue; 1121 continue;
1076 results[ret + nr_found] = rcu_dereference_raw(slot); 1122 results[ret + nr_found] =
1123 indirect_to_ptr(rcu_dereference_raw(slot));
1077 nr_found++; 1124 nr_found++;
1078 } 1125 }
1079 ret += nr_found; 1126 ret += nr_found;
@@ -1123,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
1123 results[0] = (void **)&root->rnode; 1170 results[0] = (void **)&root->rnode;
1124 return 1; 1171 return 1;
1125 } 1172 }
1126 node = radix_tree_indirect_to_ptr(node); 1173 node = indirect_to_ptr(node);
1127 1174
1128 max_index = radix_tree_maxindex(node->height); 1175 max_index = radix_tree_maxindex(node->height);
1129 1176
@@ -1159,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1159 void *newptr; 1206 void *newptr;
1160 1207
1161 BUG_ON(!radix_tree_is_indirect_ptr(to_free)); 1208 BUG_ON(!radix_tree_is_indirect_ptr(to_free));
1162 to_free = radix_tree_indirect_to_ptr(to_free); 1209 to_free = indirect_to_ptr(to_free);
1163 1210
1164 /* 1211 /*
1165 * The candidate node has more than one child, or its child 1212 * The candidate node has more than one child, or its child
@@ -1172,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1172 1219
1173 /* 1220 /*
1174 * We don't need rcu_assign_pointer(), since we are simply 1221 * We don't need rcu_assign_pointer(), since we are simply
1175 * moving the node from one part of the tree to another. If 1222 * moving the node from one part of the tree to another: if it
1176 * it was safe to dereference the old pointer to it 1223 * was safe to dereference the old pointer to it
1177 * (to_free->slots[0]), it will be safe to dereference the new 1224 * (to_free->slots[0]), it will be safe to dereference the new
1178 * one (root->rnode). 1225 * one (root->rnode) as far as dependent read barriers go.
1179 */ 1226 */
1180 newptr = to_free->slots[0]; 1227 newptr = to_free->slots[0];
1181 if (root->height > 1) 1228 if (root->height > 1)
1182 newptr = radix_tree_ptr_to_indirect(newptr); 1229 newptr = ptr_to_indirect(newptr);
1183 root->rnode = newptr; 1230 root->rnode = newptr;
1184 root->height--; 1231 root->height--;
1232
1233 /*
1234 * We have a dilemma here. The node's slot[0] must not be
1235 * NULLed in case there are concurrent lookups expecting to
1236 * find the item. However if this was a bottom-level node,
1237 * then it may be subject to the slot pointer being visible
1238 * to callers dereferencing it. If item corresponding to
1239 * slot[0] is subsequently deleted, these callers would expect
1240 * their slot to become empty sooner or later.
1241 *
1242 * For example, lockless pagecache will look up a slot, deref
1243 * the page pointer, and if the page is 0 refcount it means it
1244 * was concurrently deleted from pagecache so try the deref
1245 * again. Fortunately there is already a requirement for logic
1246 * to retry the entire slot lookup -- the indirect pointer
1247 * problem (replacing direct root node with an indirect pointer
1248 * also results in a stale slot). So tag the slot as indirect
1249 * to force callers to retry.
1250 */
1251 if (root->height == 0)
1252 *((unsigned long *)&to_free->slots[0]) |=
1253 RADIX_TREE_INDIRECT_PTR;
1254
1185 radix_tree_node_free(to_free); 1255 radix_tree_node_free(to_free);
1186 } 1256 }
1187} 1257}
@@ -1218,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1218 root->rnode = NULL; 1288 root->rnode = NULL;
1219 goto out; 1289 goto out;
1220 } 1290 }
1221 slot = radix_tree_indirect_to_ptr(slot); 1291 slot = indirect_to_ptr(slot);
1222 1292
1223 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 1293 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
1224 pathp->node = NULL; 1294 pathp->node = NULL;
@@ -1260,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1260 radix_tree_node_free(to_free); 1330 radix_tree_node_free(to_free);
1261 1331
1262 if (pathp->node->count) { 1332 if (pathp->node->count) {
1263 if (pathp->node == 1333 if (pathp->node == indirect_to_ptr(root->rnode))
1264 radix_tree_indirect_to_ptr(root->rnode))
1265 radix_tree_shrink(root); 1334 radix_tree_shrink(root);
1266 goto out; 1335 goto out;
1267 } 1336 }