-rw-r--r--  include/linux/radix-tree.h | 39
-rw-r--r--  lib/radix-tree.c           | 83
-rw-r--r--  mm/filemap.c               | 26
3 files changed, 91 insertions, 57 deletions
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index a39cbed9ee17..ab2baa5c4884 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -34,19 +34,13 @@
  * needed for RCU lookups (because root->height is unreliable). The only
  * time callers need worry about this is when doing a lookup_slot under
  * RCU.
+ *
+ * Indirect pointer in fact is also used to tag the last pointer of a node
+ * when it is shrunk, before we rcu free the node. See shrink code for
+ * details.
  */
 #define RADIX_TREE_INDIRECT_PTR 1
-#define RADIX_TREE_RETRY ((void *)-1UL)
-
-static inline void *radix_tree_ptr_to_indirect(void *ptr)
-{
-        return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
-}
 
-static inline void *radix_tree_indirect_to_ptr(void *ptr)
-{
-        return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
-}
 #define radix_tree_indirect_to_ptr(ptr) \
         radix_tree_indirect_to_ptr((void __force *)(ptr))
 
@@ -140,16 +134,29 @@ do { \
  * removed.
  *
  * For use with radix_tree_lookup_slot(). Caller must hold tree at least read
- * locked across slot lookup and dereference. More likely, will be used with
- * radix_tree_replace_slot(), as well, so caller will hold tree write locked.
+ * locked across slot lookup and dereference. Not required if write lock is
+ * held (ie. items cannot be concurrently inserted).
+ *
+ * radix_tree_deref_retry must be used to confirm validity of the pointer if
+ * only the read lock is held.
  */
 static inline void *radix_tree_deref_slot(void **pslot)
 {
-        void *ret = rcu_dereference(*pslot);
-        if (unlikely(radix_tree_is_indirect_ptr(ret)))
-                ret = RADIX_TREE_RETRY;
-        return ret;
+        return rcu_dereference(*pslot);
 }
+
+/**
+ * radix_tree_deref_retry - check radix_tree_deref_slot
+ * @arg: pointer returned by radix_tree_deref_slot
+ * Returns: 0 if retry is not required, otherwise retry is required
+ *
+ * radix_tree_deref_retry must be used with radix_tree_deref_slot.
+ */
+static inline int radix_tree_deref_retry(void *arg)
+{
+        return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR);
+}
+
 /**
  * radix_tree_replace_slot - replace item in a slot
  * @pslot: pointer to slot, returned by radix_tree_lookup_slot
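Note: the new radix_tree_deref_slot()/radix_tree_deref_retry() pair is meant to be used together under rcu_read_lock(); the mm/filemap.c hunks below are the real pagecache users. A minimal, illustrative sketch of the expected calling pattern (the lookup_item() wrapper here is hypothetical and not part of the patch):

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

/* Hypothetical example caller, not part of this patch. */
static void *lookup_item(struct radix_tree_root *tree, unsigned long index)
{
        void **slot;
        void *item;

        rcu_read_lock();
repeat:
        item = NULL;
        slot = radix_tree_lookup_slot(tree, index);
        if (slot) {
                item = radix_tree_deref_slot(slot);
                /* Stale slot left behind by a concurrent shrink: redo lookup. */
                if (radix_tree_deref_retry(item))
                        goto repeat;
        }
        rcu_read_unlock();
        return item;
}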
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6f412ab4c24f..5086bb962b4d 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -82,6 +82,16 @@ struct radix_tree_preload {
 };
 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
+static inline void *ptr_to_indirect(void *ptr)
+{
+        return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
+}
+
+static inline void *indirect_to_ptr(void *ptr)
+{
+        return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
+}
+
 static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
 {
         return root->gfp_mask & __GFP_BITS_MASK;
@@ -265,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
                         return -ENOMEM;
 
                 /* Increase the height. */
-                node->slots[0] = radix_tree_indirect_to_ptr(root->rnode);
+                node->slots[0] = indirect_to_ptr(root->rnode);
 
                 /* Propagate the aggregated tag info into the new root */
                 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
@@ -276,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
                 newheight = root->height+1;
                 node->height = newheight;
                 node->count = 1;
-                node = radix_tree_ptr_to_indirect(node);
+                node = ptr_to_indirect(node);
                 rcu_assign_pointer(root->rnode, node);
                 root->height = newheight;
         } while (height > root->height);
@@ -309,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root,
                         return error;
         }
 
-        slot = radix_tree_indirect_to_ptr(root->rnode);
+        slot = indirect_to_ptr(root->rnode);
 
         height = root->height;
         shift = (height-1) * RADIX_TREE_MAP_SHIFT;
@@ -325,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root,
                         rcu_assign_pointer(node->slots[offset], slot);
                         node->count++;
                 } else
-                        rcu_assign_pointer(root->rnode,
-                                        radix_tree_ptr_to_indirect(slot));
+                        rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
         }
 
         /* Go a level down */
@@ -374,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
                         return NULL;
                 return is_slot ? (void *)&root->rnode : node;
         }
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         height = node->height;
         if (index > radix_tree_maxindex(height))
@@ -393,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
                 height--;
         } while (height > 0);
 
-        return is_slot ? (void *)slot:node;
+        return is_slot ? (void *)slot : indirect_to_ptr(node);
 }
 
 /**
@@ -455,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
         height = root->height;
         BUG_ON(index > radix_tree_maxindex(height));
 
-        slot = radix_tree_indirect_to_ptr(root->rnode);
+        slot = indirect_to_ptr(root->rnode);
         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 
         while (height > 0) {
@@ -509,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 
         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
         pathp->node = NULL;
-        slot = radix_tree_indirect_to_ptr(root->rnode);
+        slot = indirect_to_ptr(root->rnode);
 
         while (height > 0) {
                 int offset;
@@ -579,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
 
         if (!radix_tree_is_indirect_ptr(node))
                 return (index == 0);
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         height = node->height;
         if (index > radix_tree_maxindex(height))
@@ -666,7 +675,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
         }
 
         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-        slot = radix_tree_indirect_to_ptr(root->rnode);
+        slot = indirect_to_ptr(root->rnode);
 
         /*
          * we fill the path from (root->height - 2) to 0, leaving the index at
@@ -897,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
                 results[0] = node;
                 return 1;
         }
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         max_index = radix_tree_maxindex(node->height);
 
@@ -916,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
                         slot = *(((void ***)results)[ret + i]);
                         if (!slot)
                                 continue;
-                        results[ret + nr_found] = rcu_dereference_raw(slot);
+                        results[ret + nr_found] =
+                                indirect_to_ptr(rcu_dereference_raw(slot));
                         nr_found++;
                 }
                 ret += nr_found;
@@ -965,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
                 results[0] = (void **)&root->rnode;
                 return 1;
         }
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         max_index = radix_tree_maxindex(node->height);
 
@@ -1090,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
                 results[0] = node;
                 return 1;
         }
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         max_index = radix_tree_maxindex(node->height);
 
@@ -1109,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
                         slot = *(((void ***)results)[ret + i]);
                         if (!slot)
                                 continue;
-                        results[ret + nr_found] = rcu_dereference_raw(slot);
+                        results[ret + nr_found] =
+                                indirect_to_ptr(rcu_dereference_raw(slot));
                         nr_found++;
                 }
                 ret += nr_found;
@@ -1159,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
                 results[0] = (void **)&root->rnode;
                 return 1;
         }
-        node = radix_tree_indirect_to_ptr(node);
+        node = indirect_to_ptr(node);
 
         max_index = radix_tree_maxindex(node->height);
 
@@ -1195,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
                 void *newptr;
 
                 BUG_ON(!radix_tree_is_indirect_ptr(to_free));
-                to_free = radix_tree_indirect_to_ptr(to_free);
+                to_free = indirect_to_ptr(to_free);
 
                 /*
                  * The candidate node has more than one child, or its child
@@ -1208,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
 
                 /*
                  * We don't need rcu_assign_pointer(), since we are simply
-                 * moving the node from one part of the tree to another. If
-                 * it was safe to dereference the old pointer to it
+                 * moving the node from one part of the tree to another: if it
+                 * was safe to dereference the old pointer to it
                  * (to_free->slots[0]), it will be safe to dereference the new
-                 * one (root->rnode).
+                 * one (root->rnode) as far as dependent read barriers go.
                  */
                 newptr = to_free->slots[0];
                 if (root->height > 1)
-                        newptr = radix_tree_ptr_to_indirect(newptr);
+                        newptr = ptr_to_indirect(newptr);
                 root->rnode = newptr;
                 root->height--;
+
+                /*
+                 * We have a dilemma here. The node's slot[0] must not be
+                 * NULLed in case there are concurrent lookups expecting to
+                 * find the item. However if this was a bottom-level node,
+                 * then it may be subject to the slot pointer being visible
+                 * to callers dereferencing it. If item corresponding to
+                 * slot[0] is subsequently deleted, these callers would expect
+                 * their slot to become empty sooner or later.
+                 *
+                 * For example, lockless pagecache will look up a slot, deref
+                 * the page pointer, and if the page is 0 refcount it means it
+                 * was concurrently deleted from pagecache so try the deref
+                 * again. Fortunately there is already a requirement for logic
+                 * to retry the entire slot lookup -- the indirect pointer
+                 * problem (replacing direct root node with an indirect pointer
+                 * also results in a stale slot). So tag the slot as indirect
+                 * to force callers to retry.
+                 */
+                if (root->height == 0)
+                        *((unsigned long *)&to_free->slots[0]) |=
+                                RADIX_TREE_INDIRECT_PTR;
+
                 radix_tree_node_free(to_free);
         }
 }
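The comment added above is the heart of the fix: instead of clearing the freed node's slots[0], the shrink path tags it with RADIX_TREE_INDIRECT_PTR, so an RCU reader still holding that slot pointer sees a value that radix_tree_deref_retry() rejects and redoes its lookup. A standalone illustration of the one-bit tagging (plain userspace C, not kernel code; the helpers are copies of the ones added above):

#include <assert.h>
#include <stdio.h>

#define RADIX_TREE_INDIRECT_PTR 1UL

/* Userspace copies of the helpers added in lib/radix-tree.c above. */
static void *ptr_to_indirect(void *ptr)
{
        return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
}

static void *indirect_to_ptr(void *ptr)
{
        return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
}

static int deref_retry(void *entry)
{
        return (unsigned long)entry & RADIX_TREE_INDIRECT_PTR;
}

int main(void)
{
        long item;                              /* stand-in for a data item (at least 2-byte aligned) */
        void *entry = &item;                    /* what a reader saw in the node's slots[0] */
        void *stale = ptr_to_indirect(entry);   /* what the shrink path leaves behind */

        assert(!deref_retry(entry));            /* live entry: no retry needed */
        assert(deref_retry(stale));             /* tagged entry: caller must retry */
        assert(indirect_to_ptr(stale) == entry); /* the tag is reversible */
        printf("stale slot is rejected; lookup must be retried\n");
        return 0;
}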
@@ -1254,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
                 root->rnode = NULL;
                 goto out;
         }
-        slot = radix_tree_indirect_to_ptr(slot);
+        slot = indirect_to_ptr(slot);
 
         shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
         pathp->node = NULL;
@@ -1296,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
                 radix_tree_node_free(to_free);
 
                 if (pathp->node->count) {
-                        if (pathp->node ==
-                                        radix_tree_indirect_to_ptr(root->rnode))
+                        if (pathp->node == indirect_to_ptr(root->rnode))
                                 radix_tree_shrink(root);
                         goto out;
                 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ee2e998e937..ea89840fc65f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -644,7 +644,9 @@ repeat:
         pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
         if (pagep) {
                 page = radix_tree_deref_slot(pagep);
-                if (unlikely(!page || page == RADIX_TREE_RETRY))
+                if (unlikely(!page))
+                        goto out;
+                if (radix_tree_deref_retry(page))
                         goto repeat;
 
                 if (!page_cache_get_speculative(page))
@@ -660,6 +662,7 @@ repeat:
                         goto repeat;
                 }
         }
+out:
         rcu_read_unlock();
 
         return page;
@@ -777,12 +780,11 @@ repeat:
                 page = radix_tree_deref_slot((void **)pages[i]);
                 if (unlikely(!page))
                         continue;
-                /*
-                 * this can only trigger if nr_found == 1, making livelock
-                 * a non issue.
-                 */
-                if (unlikely(page == RADIX_TREE_RETRY))
+                if (radix_tree_deref_retry(page)) {
+                        if (ret)
+                                start = pages[ret-1]->index;
                         goto restart;
+                }
 
                 if (!page_cache_get_speculative(page))
                         goto repeat;
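The hunk above also changes the restart policy: radix_tree_deref_retry() can now fire outside the old nr_found == 1 case, so before restarting the lookup advances start past the pages already taken, which keeps the loop making forward progress. A simplified, hypothetical sketch of the resulting caller shape (modelled on find_get_pages() above, not a verbatim copy; unrelated details elided):

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

/* Illustrative gang-lookup caller, not part of this patch. */
static unsigned int gang_lookup_example(struct address_space *mapping,
                                        pgoff_t start, unsigned int nr_pages,
                                        struct page **pages)
{
        unsigned int i, ret = 0, nr_found;

        rcu_read_lock();
restart:
        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
                                (void ***)pages, start, nr_pages);
        for (i = 0; i < nr_found; i++) {
                struct page *page;
repeat:
                page = radix_tree_deref_slot((void **)pages[i]);
                if (unlikely(!page))
                        continue;
                if (radix_tree_deref_retry(page)) {
                        /* Stale slot: restart, but past pages already held. */
                        if (ret)
                                start = pages[ret - 1]->index;
                        goto restart;
                }
                if (!page_cache_get_speculative(page))
                        goto repeat;
                /* Has the page moved while we took the reference? */
                if (unlikely(page != *((void **)pages[i]))) {
                        page_cache_release(page);
                        goto repeat;
                }
                pages[ret] = page;
                ret++;
        }
        rcu_read_unlock();
        return ret;
}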
@@ -830,11 +832,7 @@ repeat:
                 page = radix_tree_deref_slot((void **)pages[i]);
                 if (unlikely(!page))
                         continue;
-                /*
-                 * this can only trigger if nr_found == 1, making livelock
-                 * a non issue.
-                 */
-                if (unlikely(page == RADIX_TREE_RETRY))
+                if (radix_tree_deref_retry(page))
                         goto restart;
 
                 if (page->mapping == NULL || page->index != index)
@@ -887,11 +885,7 @@ repeat:
                 page = radix_tree_deref_slot((void **)pages[i]);
                 if (unlikely(!page))
                         continue;
-                /*
-                 * this can only trigger if nr_found == 1, making livelock
-                 * a non issue.
-                 */
-                if (unlikely(page == RADIX_TREE_RETRY))
+                if (radix_tree_deref_retry(page))
                         goto restart;
 
                 if (!page_cache_get_speculative(page))