aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Piggin <npiggin@kernel.dk>2010-11-11 17:05:19 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2010-11-12 10:55:32 -0500
commit27d20fddc8af539464fc3ba499d6a830054c3bd6 (patch)
tree23514cfe88f90150a8635c47586a8a378fb905e3
parenteaf06b241b091357e72b76863ba16e89610d31bd (diff)
radix-tree: fix RCU bug
Salman Qazi describes the following radix-tree bug: In the following case, we get can get a deadlock: 0. The radix tree contains two items, one has the index 0. 1. The reader (in this case find_get_pages) takes the rcu_read_lock. 2. The reader acquires slot(s) for item(s) including the index 0 item. 3. The non-zero index item is deleted, and as a consequence the other item is moved to the root of the tree. The place where it used to be is queued for deletion after the readers finish. 3b. The zero item is deleted, removing it from the direct slot, it remains in the rcu-delayed indirect node. 4. The reader looks at the index 0 slot, and finds that the page has 0 ref count 5. The reader looks at it again, hoping that the item will either be freed or the ref count will increase. This never happens, as the slot it is looking at will never be updated. Also, this slot can never be reclaimed because the reader is holding rcu_read_lock and is in an infinite loop. The fix is to re-use the same "indirect" pointer case that requires a slot lookup retry into a general "retry the lookup" bit. Signed-off-by: Nick Piggin <npiggin@kernel.dk> Reported-by: Salman Qazi <sqazi@google.com> Cc: <stable@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/radix-tree.h39
-rw-r--r--lib/radix-tree.c83
-rw-r--r--mm/filemap.c26
3 files changed, 91 insertions, 57 deletions
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index a39cbed9ee17..ab2baa5c4884 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -34,19 +34,13 @@
34 * needed for RCU lookups (because root->height is unreliable). The only 34 * needed for RCU lookups (because root->height is unreliable). The only
35 * time callers need worry about this is when doing a lookup_slot under 35 * time callers need worry about this is when doing a lookup_slot under
36 * RCU. 36 * RCU.
37 *
38 * Indirect pointer in fact is also used to tag the last pointer of a node
39 * when it is shrunk, before we rcu free the node. See shrink code for
40 * details.
37 */ 41 */
38#define RADIX_TREE_INDIRECT_PTR 1 42#define RADIX_TREE_INDIRECT_PTR 1
39#define RADIX_TREE_RETRY ((void *)-1UL)
40
41static inline void *radix_tree_ptr_to_indirect(void *ptr)
42{
43 return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
44}
45 43
46static inline void *radix_tree_indirect_to_ptr(void *ptr)
47{
48 return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
49}
50#define radix_tree_indirect_to_ptr(ptr) \ 44#define radix_tree_indirect_to_ptr(ptr) \
51 radix_tree_indirect_to_ptr((void __force *)(ptr)) 45 radix_tree_indirect_to_ptr((void __force *)(ptr))
52 46
@@ -140,16 +134,29 @@ do { \
140 * removed. 134 * removed.
141 * 135 *
142 * For use with radix_tree_lookup_slot(). Caller must hold tree at least read 136 * For use with radix_tree_lookup_slot(). Caller must hold tree at least read
143 * locked across slot lookup and dereference. More likely, will be used with 137 * locked across slot lookup and dereference. Not required if write lock is
144 * radix_tree_replace_slot(), as well, so caller will hold tree write locked. 138 * held (ie. items cannot be concurrently inserted).
139 *
140 * radix_tree_deref_retry must be used to confirm validity of the pointer if
141 * only the read lock is held.
145 */ 142 */
146static inline void *radix_tree_deref_slot(void **pslot) 143static inline void *radix_tree_deref_slot(void **pslot)
147{ 144{
148 void *ret = rcu_dereference(*pslot); 145 return rcu_dereference(*pslot);
149 if (unlikely(radix_tree_is_indirect_ptr(ret)))
150 ret = RADIX_TREE_RETRY;
151 return ret;
152} 146}
147
148/**
149 * radix_tree_deref_retry - check radix_tree_deref_slot
150 * @arg: pointer returned by radix_tree_deref_slot
151 * Returns: 0 if retry is not required, otherwise retry is required
152 *
153 * radix_tree_deref_retry must be used with radix_tree_deref_slot.
154 */
155static inline int radix_tree_deref_retry(void *arg)
156{
157 return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR);
158}
159
153/** 160/**
154 * radix_tree_replace_slot - replace item in a slot 161 * radix_tree_replace_slot - replace item in a slot
155 * @pslot: pointer to slot, returned by radix_tree_lookup_slot 162 * @pslot: pointer to slot, returned by radix_tree_lookup_slot
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6f412ab4c24f..5086bb962b4d 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -82,6 +82,16 @@ struct radix_tree_preload {
82}; 82};
83static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; 83static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
84 84
85static inline void *ptr_to_indirect(void *ptr)
86{
87 return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
88}
89
90static inline void *indirect_to_ptr(void *ptr)
91{
92 return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
93}
94
85static inline gfp_t root_gfp_mask(struct radix_tree_root *root) 95static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
86{ 96{
87 return root->gfp_mask & __GFP_BITS_MASK; 97 return root->gfp_mask & __GFP_BITS_MASK;
@@ -265,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
265 return -ENOMEM; 275 return -ENOMEM;
266 276
267 /* Increase the height. */ 277 /* Increase the height. */
268 node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); 278 node->slots[0] = indirect_to_ptr(root->rnode);
269 279
270 /* Propagate the aggregated tag info into the new root */ 280 /* Propagate the aggregated tag info into the new root */
271 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { 281 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
@@ -276,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
276 newheight = root->height+1; 286 newheight = root->height+1;
277 node->height = newheight; 287 node->height = newheight;
278 node->count = 1; 288 node->count = 1;
279 node = radix_tree_ptr_to_indirect(node); 289 node = ptr_to_indirect(node);
280 rcu_assign_pointer(root->rnode, node); 290 rcu_assign_pointer(root->rnode, node);
281 root->height = newheight; 291 root->height = newheight;
282 } while (height > root->height); 292 } while (height > root->height);
@@ -309,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root,
309 return error; 319 return error;
310 } 320 }
311 321
312 slot = radix_tree_indirect_to_ptr(root->rnode); 322 slot = indirect_to_ptr(root->rnode);
313 323
314 height = root->height; 324 height = root->height;
315 shift = (height-1) * RADIX_TREE_MAP_SHIFT; 325 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
@@ -325,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root,
325 rcu_assign_pointer(node->slots[offset], slot); 335 rcu_assign_pointer(node->slots[offset], slot);
326 node->count++; 336 node->count++;
327 } else 337 } else
328 rcu_assign_pointer(root->rnode, 338 rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
329 radix_tree_ptr_to_indirect(slot));
330 } 339 }
331 340
332 /* Go a level down */ 341 /* Go a level down */
@@ -374,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
374 return NULL; 383 return NULL;
375 return is_slot ? (void *)&root->rnode : node; 384 return is_slot ? (void *)&root->rnode : node;
376 } 385 }
377 node = radix_tree_indirect_to_ptr(node); 386 node = indirect_to_ptr(node);
378 387
379 height = node->height; 388 height = node->height;
380 if (index > radix_tree_maxindex(height)) 389 if (index > radix_tree_maxindex(height))
@@ -393,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
393 height--; 402 height--;
394 } while (height > 0); 403 } while (height > 0);
395 404
396 return is_slot ? (void *)slot:node; 405 return is_slot ? (void *)slot : indirect_to_ptr(node);
397} 406}
398 407
399/** 408/**
@@ -455,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
455 height = root->height; 464 height = root->height;
456 BUG_ON(index > radix_tree_maxindex(height)); 465 BUG_ON(index > radix_tree_maxindex(height));
457 466
458 slot = radix_tree_indirect_to_ptr(root->rnode); 467 slot = indirect_to_ptr(root->rnode);
459 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 468 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
460 469
461 while (height > 0) { 470 while (height > 0) {
@@ -509,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
509 518
510 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 519 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
511 pathp->node = NULL; 520 pathp->node = NULL;
512 slot = radix_tree_indirect_to_ptr(root->rnode); 521 slot = indirect_to_ptr(root->rnode);
513 522
514 while (height > 0) { 523 while (height > 0) {
515 int offset; 524 int offset;
@@ -579,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
579 588
580 if (!radix_tree_is_indirect_ptr(node)) 589 if (!radix_tree_is_indirect_ptr(node))
581 return (index == 0); 590 return (index == 0);
582 node = radix_tree_indirect_to_ptr(node); 591 node = indirect_to_ptr(node);
583 592
584 height = node->height; 593 height = node->height;
585 if (index > radix_tree_maxindex(height)) 594 if (index > radix_tree_maxindex(height))
@@ -666,7 +675,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
666 } 675 }
667 676
668 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 677 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
669 slot = radix_tree_indirect_to_ptr(root->rnode); 678 slot = indirect_to_ptr(root->rnode);
670 679
671 /* 680 /*
672 * we fill the path from (root->height - 2) to 0, leaving the index at 681 * we fill the path from (root->height - 2) to 0, leaving the index at
@@ -897,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
897 results[0] = node; 906 results[0] = node;
898 return 1; 907 return 1;
899 } 908 }
900 node = radix_tree_indirect_to_ptr(node); 909 node = indirect_to_ptr(node);
901 910
902 max_index = radix_tree_maxindex(node->height); 911 max_index = radix_tree_maxindex(node->height);
903 912
@@ -916,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
916 slot = *(((void ***)results)[ret + i]); 925 slot = *(((void ***)results)[ret + i]);
917 if (!slot) 926 if (!slot)
918 continue; 927 continue;
919 results[ret + nr_found] = rcu_dereference_raw(slot); 928 results[ret + nr_found] =
929 indirect_to_ptr(rcu_dereference_raw(slot));
920 nr_found++; 930 nr_found++;
921 } 931 }
922 ret += nr_found; 932 ret += nr_found;
@@ -965,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
965 results[0] = (void **)&root->rnode; 975 results[0] = (void **)&root->rnode;
966 return 1; 976 return 1;
967 } 977 }
968 node = radix_tree_indirect_to_ptr(node); 978 node = indirect_to_ptr(node);
969 979
970 max_index = radix_tree_maxindex(node->height); 980 max_index = radix_tree_maxindex(node->height);
971 981
@@ -1090,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
1090 results[0] = node; 1100 results[0] = node;
1091 return 1; 1101 return 1;
1092 } 1102 }
1093 node = radix_tree_indirect_to_ptr(node); 1103 node = indirect_to_ptr(node);
1094 1104
1095 max_index = radix_tree_maxindex(node->height); 1105 max_index = radix_tree_maxindex(node->height);
1096 1106
@@ -1109,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
1109 slot = *(((void ***)results)[ret + i]); 1119 slot = *(((void ***)results)[ret + i]);
1110 if (!slot) 1120 if (!slot)
1111 continue; 1121 continue;
1112 results[ret + nr_found] = rcu_dereference_raw(slot); 1122 results[ret + nr_found] =
1123 indirect_to_ptr(rcu_dereference_raw(slot));
1113 nr_found++; 1124 nr_found++;
1114 } 1125 }
1115 ret += nr_found; 1126 ret += nr_found;
@@ -1159,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
1159 results[0] = (void **)&root->rnode; 1170 results[0] = (void **)&root->rnode;
1160 return 1; 1171 return 1;
1161 } 1172 }
1162 node = radix_tree_indirect_to_ptr(node); 1173 node = indirect_to_ptr(node);
1163 1174
1164 max_index = radix_tree_maxindex(node->height); 1175 max_index = radix_tree_maxindex(node->height);
1165 1176
@@ -1195,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1195 void *newptr; 1206 void *newptr;
1196 1207
1197 BUG_ON(!radix_tree_is_indirect_ptr(to_free)); 1208 BUG_ON(!radix_tree_is_indirect_ptr(to_free));
1198 to_free = radix_tree_indirect_to_ptr(to_free); 1209 to_free = indirect_to_ptr(to_free);
1199 1210
1200 /* 1211 /*
1201 * The candidate node has more than one child, or its child 1212 * The candidate node has more than one child, or its child
@@ -1208,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1208 1219
1209 /* 1220 /*
1210 * We don't need rcu_assign_pointer(), since we are simply 1221 * We don't need rcu_assign_pointer(), since we are simply
1211 * moving the node from one part of the tree to another. If 1222 * moving the node from one part of the tree to another: if it
1212 * it was safe to dereference the old pointer to it 1223 * was safe to dereference the old pointer to it
1213 * (to_free->slots[0]), it will be safe to dereference the new 1224 * (to_free->slots[0]), it will be safe to dereference the new
1214 * one (root->rnode). 1225 * one (root->rnode) as far as dependent read barriers go.
1215 */ 1226 */
1216 newptr = to_free->slots[0]; 1227 newptr = to_free->slots[0];
1217 if (root->height > 1) 1228 if (root->height > 1)
1218 newptr = radix_tree_ptr_to_indirect(newptr); 1229 newptr = ptr_to_indirect(newptr);
1219 root->rnode = newptr; 1230 root->rnode = newptr;
1220 root->height--; 1231 root->height--;
1232
1233 /*
1234 * We have a dilemma here. The node's slot[0] must not be
1235 * NULLed in case there are concurrent lookups expecting to
1236 * find the item. However if this was a bottom-level node,
1237 * then it may be subject to the slot pointer being visible
1238 * to callers dereferencing it. If item corresponding to
1239 * slot[0] is subsequently deleted, these callers would expect
1240 * their slot to become empty sooner or later.
1241 *
1242 * For example, lockless pagecache will look up a slot, deref
1243 * the page pointer, and if the page is 0 refcount it means it
1244 * was concurrently deleted from pagecache so try the deref
1245 * again. Fortunately there is already a requirement for logic
1246 * to retry the entire slot lookup -- the indirect pointer
1247 * problem (replacing direct root node with an indirect pointer
1248 * also results in a stale slot). So tag the slot as indirect
1249 * to force callers to retry.
1250 */
1251 if (root->height == 0)
1252 *((unsigned long *)&to_free->slots[0]) |=
1253 RADIX_TREE_INDIRECT_PTR;
1254
1221 radix_tree_node_free(to_free); 1255 radix_tree_node_free(to_free);
1222 } 1256 }
1223} 1257}
@@ -1254,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1254 root->rnode = NULL; 1288 root->rnode = NULL;
1255 goto out; 1289 goto out;
1256 } 1290 }
1257 slot = radix_tree_indirect_to_ptr(slot); 1291 slot = indirect_to_ptr(slot);
1258 1292
1259 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 1293 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
1260 pathp->node = NULL; 1294 pathp->node = NULL;
@@ -1296,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1296 radix_tree_node_free(to_free); 1330 radix_tree_node_free(to_free);
1297 1331
1298 if (pathp->node->count) { 1332 if (pathp->node->count) {
1299 if (pathp->node == 1333 if (pathp->node == indirect_to_ptr(root->rnode))
1300 radix_tree_indirect_to_ptr(root->rnode))
1301 radix_tree_shrink(root); 1334 radix_tree_shrink(root);
1302 goto out; 1335 goto out;
1303 } 1336 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ee2e998e937..ea89840fc65f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -644,7 +644,9 @@ repeat:
644 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); 644 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
645 if (pagep) { 645 if (pagep) {
646 page = radix_tree_deref_slot(pagep); 646 page = radix_tree_deref_slot(pagep);
647 if (unlikely(!page || page == RADIX_TREE_RETRY)) 647 if (unlikely(!page))
648 goto out;
649 if (radix_tree_deref_retry(page))
648 goto repeat; 650 goto repeat;
649 651
650 if (!page_cache_get_speculative(page)) 652 if (!page_cache_get_speculative(page))
@@ -660,6 +662,7 @@ repeat:
660 goto repeat; 662 goto repeat;
661 } 663 }
662 } 664 }
665out:
663 rcu_read_unlock(); 666 rcu_read_unlock();
664 667
665 return page; 668 return page;
@@ -777,12 +780,11 @@ repeat:
777 page = radix_tree_deref_slot((void **)pages[i]); 780 page = radix_tree_deref_slot((void **)pages[i]);
778 if (unlikely(!page)) 781 if (unlikely(!page))
779 continue; 782 continue;
780 /* 783 if (radix_tree_deref_retry(page)) {
781 * this can only trigger if nr_found == 1, making livelock 784 if (ret)
782 * a non issue. 785 start = pages[ret-1]->index;
783 */
784 if (unlikely(page == RADIX_TREE_RETRY))
785 goto restart; 786 goto restart;
787 }
786 788
787 if (!page_cache_get_speculative(page)) 789 if (!page_cache_get_speculative(page))
788 goto repeat; 790 goto repeat;
@@ -830,11 +832,7 @@ repeat:
830 page = radix_tree_deref_slot((void **)pages[i]); 832 page = radix_tree_deref_slot((void **)pages[i]);
831 if (unlikely(!page)) 833 if (unlikely(!page))
832 continue; 834 continue;
833 /* 835 if (radix_tree_deref_retry(page))
834 * this can only trigger if nr_found == 1, making livelock
835 * a non issue.
836 */
837 if (unlikely(page == RADIX_TREE_RETRY))
838 goto restart; 836 goto restart;
839 837
840 if (page->mapping == NULL || page->index != index) 838 if (page->mapping == NULL || page->index != index)
@@ -887,11 +885,7 @@ repeat:
887 page = radix_tree_deref_slot((void **)pages[i]); 885 page = radix_tree_deref_slot((void **)pages[i]);
888 if (unlikely(!page)) 886 if (unlikely(!page))
889 continue; 887 continue;
890 /* 888 if (radix_tree_deref_retry(page))
891 * this can only trigger if nr_found == 1, making livelock
892 * a non issue.
893 */
894 if (unlikely(page == RADIX_TREE_RETRY))
895 goto restart; 889 goto restart;
896 890
897 if (!page_cache_get_speculative(page)) 891 if (!page_cache_get_speculative(page))