aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/radix-tree.h39
-rw-r--r--lib/radix-tree.c83
-rw-r--r--mm/filemap.c26
3 files changed, 91 insertions, 57 deletions
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index a39cbed9ee17..ab2baa5c4884 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -34,19 +34,13 @@
34 * needed for RCU lookups (because root->height is unreliable). The only 34 * needed for RCU lookups (because root->height is unreliable). The only
35 * time callers need worry about this is when doing a lookup_slot under 35 * time callers need worry about this is when doing a lookup_slot under
36 * RCU. 36 * RCU.
37 *
38 * Indirect pointer in fact is also used to tag the last pointer of a node
39 * when it is shrunk, before we rcu free the node. See shrink code for
40 * details.
37 */ 41 */
38#define RADIX_TREE_INDIRECT_PTR 1 42#define RADIX_TREE_INDIRECT_PTR 1
39#define RADIX_TREE_RETRY ((void *)-1UL)
40
41static inline void *radix_tree_ptr_to_indirect(void *ptr)
42{
43 return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
44}
45 43
46static inline void *radix_tree_indirect_to_ptr(void *ptr)
47{
48 return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
49}
50#define radix_tree_indirect_to_ptr(ptr) \ 44#define radix_tree_indirect_to_ptr(ptr) \
51 radix_tree_indirect_to_ptr((void __force *)(ptr)) 45 radix_tree_indirect_to_ptr((void __force *)(ptr))
52 46
@@ -140,16 +134,29 @@ do { \
140 * removed. 134 * removed.
141 * 135 *
142 * For use with radix_tree_lookup_slot(). Caller must hold tree at least read 136 * For use with radix_tree_lookup_slot(). Caller must hold tree at least read
143 * locked across slot lookup and dereference. More likely, will be used with 137 * locked across slot lookup and dereference. Not required if write lock is
144 * radix_tree_replace_slot(), as well, so caller will hold tree write locked. 138 * held (ie. items cannot be concurrently inserted).
139 *
140 * radix_tree_deref_retry must be used to confirm validity of the pointer if
141 * only the read lock is held.
145 */ 142 */
146static inline void *radix_tree_deref_slot(void **pslot) 143static inline void *radix_tree_deref_slot(void **pslot)
147{ 144{
148 void *ret = rcu_dereference(*pslot); 145 return rcu_dereference(*pslot);
149 if (unlikely(radix_tree_is_indirect_ptr(ret)))
150 ret = RADIX_TREE_RETRY;
151 return ret;
152} 146}
147
148/**
149 * radix_tree_deref_retry - check radix_tree_deref_slot
150 * @arg: pointer returned by radix_tree_deref_slot
151 * Returns: 0 if retry is not required, otherwise retry is required
152 *
153 * radix_tree_deref_retry must be used with radix_tree_deref_slot.
154 */
155static inline int radix_tree_deref_retry(void *arg)
156{
157 return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR);
158}
159
153/** 160/**
154 * radix_tree_replace_slot - replace item in a slot 161 * radix_tree_replace_slot - replace item in a slot
155 * @pslot: pointer to slot, returned by radix_tree_lookup_slot 162 * @pslot: pointer to slot, returned by radix_tree_lookup_slot
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6f412ab4c24f..5086bb962b4d 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -82,6 +82,16 @@ struct radix_tree_preload {
82}; 82};
83static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; 83static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
84 84
85static inline void *ptr_to_indirect(void *ptr)
86{
87 return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
88}
89
90static inline void *indirect_to_ptr(void *ptr)
91{
92 return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
93}
94
85static inline gfp_t root_gfp_mask(struct radix_tree_root *root) 95static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
86{ 96{
87 return root->gfp_mask & __GFP_BITS_MASK; 97 return root->gfp_mask & __GFP_BITS_MASK;
@@ -265,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
265 return -ENOMEM; 275 return -ENOMEM;
266 276
267 /* Increase the height. */ 277 /* Increase the height. */
268 node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); 278 node->slots[0] = indirect_to_ptr(root->rnode);
269 279
270 /* Propagate the aggregated tag info into the new root */ 280 /* Propagate the aggregated tag info into the new root */
271 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { 281 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
@@ -276,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
276 newheight = root->height+1; 286 newheight = root->height+1;
277 node->height = newheight; 287 node->height = newheight;
278 node->count = 1; 288 node->count = 1;
279 node = radix_tree_ptr_to_indirect(node); 289 node = ptr_to_indirect(node);
280 rcu_assign_pointer(root->rnode, node); 290 rcu_assign_pointer(root->rnode, node);
281 root->height = newheight; 291 root->height = newheight;
282 } while (height > root->height); 292 } while (height > root->height);
@@ -309,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root,
309 return error; 319 return error;
310 } 320 }
311 321
312 slot = radix_tree_indirect_to_ptr(root->rnode); 322 slot = indirect_to_ptr(root->rnode);
313 323
314 height = root->height; 324 height = root->height;
315 shift = (height-1) * RADIX_TREE_MAP_SHIFT; 325 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
@@ -325,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root,
325 rcu_assign_pointer(node->slots[offset], slot); 335 rcu_assign_pointer(node->slots[offset], slot);
326 node->count++; 336 node->count++;
327 } else 337 } else
328 rcu_assign_pointer(root->rnode, 338 rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
329 radix_tree_ptr_to_indirect(slot));
330 } 339 }
331 340
332 /* Go a level down */ 341 /* Go a level down */
@@ -374,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
374 return NULL; 383 return NULL;
375 return is_slot ? (void *)&root->rnode : node; 384 return is_slot ? (void *)&root->rnode : node;
376 } 385 }
377 node = radix_tree_indirect_to_ptr(node); 386 node = indirect_to_ptr(node);
378 387
379 height = node->height; 388 height = node->height;
380 if (index > radix_tree_maxindex(height)) 389 if (index > radix_tree_maxindex(height))
@@ -393,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
393 height--; 402 height--;
394 } while (height > 0); 403 } while (height > 0);
395 404
396 return is_slot ? (void *)slot:node; 405 return is_slot ? (void *)slot : indirect_to_ptr(node);
397} 406}
398 407
399/** 408/**
@@ -455,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
455 height = root->height; 464 height = root->height;
456 BUG_ON(index > radix_tree_maxindex(height)); 465 BUG_ON(index > radix_tree_maxindex(height));
457 466
458 slot = radix_tree_indirect_to_ptr(root->rnode); 467 slot = indirect_to_ptr(root->rnode);
459 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 468 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
460 469
461 while (height > 0) { 470 while (height > 0) {
@@ -509,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
509 518
510 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 519 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
511 pathp->node = NULL; 520 pathp->node = NULL;
512 slot = radix_tree_indirect_to_ptr(root->rnode); 521 slot = indirect_to_ptr(root->rnode);
513 522
514 while (height > 0) { 523 while (height > 0) {
515 int offset; 524 int offset;
@@ -579,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
579 588
580 if (!radix_tree_is_indirect_ptr(node)) 589 if (!radix_tree_is_indirect_ptr(node))
581 return (index == 0); 590 return (index == 0);
582 node = radix_tree_indirect_to_ptr(node); 591 node = indirect_to_ptr(node);
583 592
584 height = node->height; 593 height = node->height;
585 if (index > radix_tree_maxindex(height)) 594 if (index > radix_tree_maxindex(height))
@@ -666,7 +675,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
666 } 675 }
667 676
668 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 677 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
669 slot = radix_tree_indirect_to_ptr(root->rnode); 678 slot = indirect_to_ptr(root->rnode);
670 679
671 /* 680 /*
672 * we fill the path from (root->height - 2) to 0, leaving the index at 681 * we fill the path from (root->height - 2) to 0, leaving the index at
@@ -897,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
897 results[0] = node; 906 results[0] = node;
898 return 1; 907 return 1;
899 } 908 }
900 node = radix_tree_indirect_to_ptr(node); 909 node = indirect_to_ptr(node);
901 910
902 max_index = radix_tree_maxindex(node->height); 911 max_index = radix_tree_maxindex(node->height);
903 912
@@ -916,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
916 slot = *(((void ***)results)[ret + i]); 925 slot = *(((void ***)results)[ret + i]);
917 if (!slot) 926 if (!slot)
918 continue; 927 continue;
919 results[ret + nr_found] = rcu_dereference_raw(slot); 928 results[ret + nr_found] =
929 indirect_to_ptr(rcu_dereference_raw(slot));
920 nr_found++; 930 nr_found++;
921 } 931 }
922 ret += nr_found; 932 ret += nr_found;
@@ -965,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
965 results[0] = (void **)&root->rnode; 975 results[0] = (void **)&root->rnode;
966 return 1; 976 return 1;
967 } 977 }
968 node = radix_tree_indirect_to_ptr(node); 978 node = indirect_to_ptr(node);
969 979
970 max_index = radix_tree_maxindex(node->height); 980 max_index = radix_tree_maxindex(node->height);
971 981
@@ -1090,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
1090 results[0] = node; 1100 results[0] = node;
1091 return 1; 1101 return 1;
1092 } 1102 }
1093 node = radix_tree_indirect_to_ptr(node); 1103 node = indirect_to_ptr(node);
1094 1104
1095 max_index = radix_tree_maxindex(node->height); 1105 max_index = radix_tree_maxindex(node->height);
1096 1106
@@ -1109,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
1109 slot = *(((void ***)results)[ret + i]); 1119 slot = *(((void ***)results)[ret + i]);
1110 if (!slot) 1120 if (!slot)
1111 continue; 1121 continue;
1112 results[ret + nr_found] = rcu_dereference_raw(slot); 1122 results[ret + nr_found] =
1123 indirect_to_ptr(rcu_dereference_raw(slot));
1113 nr_found++; 1124 nr_found++;
1114 } 1125 }
1115 ret += nr_found; 1126 ret += nr_found;
@@ -1159,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
1159 results[0] = (void **)&root->rnode; 1170 results[0] = (void **)&root->rnode;
1160 return 1; 1171 return 1;
1161 } 1172 }
1162 node = radix_tree_indirect_to_ptr(node); 1173 node = indirect_to_ptr(node);
1163 1174
1164 max_index = radix_tree_maxindex(node->height); 1175 max_index = radix_tree_maxindex(node->height);
1165 1176
@@ -1195,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1195 void *newptr; 1206 void *newptr;
1196 1207
1197 BUG_ON(!radix_tree_is_indirect_ptr(to_free)); 1208 BUG_ON(!radix_tree_is_indirect_ptr(to_free));
1198 to_free = radix_tree_indirect_to_ptr(to_free); 1209 to_free = indirect_to_ptr(to_free);
1199 1210
1200 /* 1211 /*
1201 * The candidate node has more than one child, or its child 1212 * The candidate node has more than one child, or its child
@@ -1208,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1208 1219
1209 /* 1220 /*
1210 * We don't need rcu_assign_pointer(), since we are simply 1221 * We don't need rcu_assign_pointer(), since we are simply
1211 * moving the node from one part of the tree to another. If 1222 * moving the node from one part of the tree to another: if it
1212 * it was safe to dereference the old pointer to it 1223 * was safe to dereference the old pointer to it
1213 * (to_free->slots[0]), it will be safe to dereference the new 1224 * (to_free->slots[0]), it will be safe to dereference the new
1214 * one (root->rnode). 1225 * one (root->rnode) as far as dependent read barriers go.
1215 */ 1226 */
1216 newptr = to_free->slots[0]; 1227 newptr = to_free->slots[0];
1217 if (root->height > 1) 1228 if (root->height > 1)
1218 newptr = radix_tree_ptr_to_indirect(newptr); 1229 newptr = ptr_to_indirect(newptr);
1219 root->rnode = newptr; 1230 root->rnode = newptr;
1220 root->height--; 1231 root->height--;
1232
1233 /*
1234 * We have a dilemma here. The node's slot[0] must not be
1235 * NULLed in case there are concurrent lookups expecting to
1236 * find the item. However if this was a bottom-level node,
1237 * then it may be subject to the slot pointer being visible
1238 * to callers dereferencing it. If item corresponding to
1239 * slot[0] is subsequently deleted, these callers would expect
1240 * their slot to become empty sooner or later.
1241 *
1242 * For example, lockless pagecache will look up a slot, deref
1243 * the page pointer, and if the page is 0 refcount it means it
1244 * was concurrently deleted from pagecache so try the deref
1245 * again. Fortunately there is already a requirement for logic
1246 * to retry the entire slot lookup -- the indirect pointer
1247 * problem (replacing direct root node with an indirect pointer
1248 * also results in a stale slot). So tag the slot as indirect
1249 * to force callers to retry.
1250 */
1251 if (root->height == 0)
1252 *((unsigned long *)&to_free->slots[0]) |=
1253 RADIX_TREE_INDIRECT_PTR;
1254
1221 radix_tree_node_free(to_free); 1255 radix_tree_node_free(to_free);
1222 } 1256 }
1223} 1257}
@@ -1254,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1254 root->rnode = NULL; 1288 root->rnode = NULL;
1255 goto out; 1289 goto out;
1256 } 1290 }
1257 slot = radix_tree_indirect_to_ptr(slot); 1291 slot = indirect_to_ptr(slot);
1258 1292
1259 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 1293 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
1260 pathp->node = NULL; 1294 pathp->node = NULL;
@@ -1296,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1296 radix_tree_node_free(to_free); 1330 radix_tree_node_free(to_free);
1297 1331
1298 if (pathp->node->count) { 1332 if (pathp->node->count) {
1299 if (pathp->node == 1333 if (pathp->node == indirect_to_ptr(root->rnode))
1300 radix_tree_indirect_to_ptr(root->rnode))
1301 radix_tree_shrink(root); 1334 radix_tree_shrink(root);
1302 goto out; 1335 goto out;
1303 } 1336 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ee2e998e937..ea89840fc65f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -644,7 +644,9 @@ repeat:
644 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); 644 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
645 if (pagep) { 645 if (pagep) {
646 page = radix_tree_deref_slot(pagep); 646 page = radix_tree_deref_slot(pagep);
647 if (unlikely(!page || page == RADIX_TREE_RETRY)) 647 if (unlikely(!page))
648 goto out;
649 if (radix_tree_deref_retry(page))
648 goto repeat; 650 goto repeat;
649 651
650 if (!page_cache_get_speculative(page)) 652 if (!page_cache_get_speculative(page))
@@ -660,6 +662,7 @@ repeat:
660 goto repeat; 662 goto repeat;
661 } 663 }
662 } 664 }
665out:
663 rcu_read_unlock(); 666 rcu_read_unlock();
664 667
665 return page; 668 return page;
@@ -777,12 +780,11 @@ repeat:
777 page = radix_tree_deref_slot((void **)pages[i]); 780 page = radix_tree_deref_slot((void **)pages[i]);
778 if (unlikely(!page)) 781 if (unlikely(!page))
779 continue; 782 continue;
780 /* 783 if (radix_tree_deref_retry(page)) {
781 * this can only trigger if nr_found == 1, making livelock 784 if (ret)
782 * a non issue. 785 start = pages[ret-1]->index;
783 */
784 if (unlikely(page == RADIX_TREE_RETRY))
785 goto restart; 786 goto restart;
787 }
786 788
787 if (!page_cache_get_speculative(page)) 789 if (!page_cache_get_speculative(page))
788 goto repeat; 790 goto repeat;
@@ -830,11 +832,7 @@ repeat:
830 page = radix_tree_deref_slot((void **)pages[i]); 832 page = radix_tree_deref_slot((void **)pages[i]);
831 if (unlikely(!page)) 833 if (unlikely(!page))
832 continue; 834 continue;
833 /* 835 if (radix_tree_deref_retry(page))
834 * this can only trigger if nr_found == 1, making livelock
835 * a non issue.
836 */
837 if (unlikely(page == RADIX_TREE_RETRY))
838 goto restart; 836 goto restart;
839 837
840 if (page->mapping == NULL || page->index != index) 838 if (page->mapping == NULL || page->index != index)
@@ -887,11 +885,7 @@ repeat:
887 page = radix_tree_deref_slot((void **)pages[i]); 885 page = radix_tree_deref_slot((void **)pages[i]);
888 if (unlikely(!page)) 886 if (unlikely(!page))
889 continue; 887 continue;
890 /* 888 if (radix_tree_deref_retry(page))
891 * this can only trigger if nr_found == 1, making livelock
892 * a non issue.
893 */
894 if (unlikely(page == RADIX_TREE_RETRY))
895 goto restart; 889 goto restart;
896 890
897 if (!page_cache_get_speculative(page)) 891 if (!page_cache_get_speculative(page))