aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorNick Piggin <npiggin@kernel.dk>2010-11-11 17:05:19 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2010-11-12 10:55:32 -0500
commit27d20fddc8af539464fc3ba499d6a830054c3bd6 (patch)
tree23514cfe88f90150a8635c47586a8a378fb905e3 /lib
parenteaf06b241b091357e72b76863ba16e89610d31bd (diff)
radix-tree: fix RCU bug
Salman Qazi describes the following radix-tree bug: In the following case, we get can get a deadlock: 0. The radix tree contains two items, one has the index 0. 1. The reader (in this case find_get_pages) takes the rcu_read_lock. 2. The reader acquires slot(s) for item(s) including the index 0 item. 3. The non-zero index item is deleted, and as a consequence the other item is moved to the root of the tree. The place where it used to be is queued for deletion after the readers finish. 3b. The zero item is deleted, removing it from the direct slot, it remains in the rcu-delayed indirect node. 4. The reader looks at the index 0 slot, and finds that the page has 0 ref count 5. The reader looks at it again, hoping that the item will either be freed or the ref count will increase. This never happens, as the slot it is looking at will never be updated. Also, this slot can never be reclaimed because the reader is holding rcu_read_lock and is in an infinite loop. The fix is to re-use the same "indirect" pointer case that requires a slot lookup retry into a general "retry the lookup" bit. Signed-off-by: Nick Piggin <npiggin@kernel.dk> Reported-by: Salman Qazi <sqazi@google.com> Cc: <stable@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'lib')
-rw-r--r--lib/radix-tree.c83
1 files changed, 58 insertions, 25 deletions
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6f412ab4c24f..5086bb962b4d 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -82,6 +82,16 @@ struct radix_tree_preload {
82}; 82};
83static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; 83static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
84 84
85static inline void *ptr_to_indirect(void *ptr)
86{
87 return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR);
88}
89
90static inline void *indirect_to_ptr(void *ptr)
91{
92 return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
93}
94
85static inline gfp_t root_gfp_mask(struct radix_tree_root *root) 95static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
86{ 96{
87 return root->gfp_mask & __GFP_BITS_MASK; 97 return root->gfp_mask & __GFP_BITS_MASK;
@@ -265,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
265 return -ENOMEM; 275 return -ENOMEM;
266 276
267 /* Increase the height. */ 277 /* Increase the height. */
268 node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); 278 node->slots[0] = indirect_to_ptr(root->rnode);
269 279
270 /* Propagate the aggregated tag info into the new root */ 280 /* Propagate the aggregated tag info into the new root */
271 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { 281 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
@@ -276,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
276 newheight = root->height+1; 286 newheight = root->height+1;
277 node->height = newheight; 287 node->height = newheight;
278 node->count = 1; 288 node->count = 1;
279 node = radix_tree_ptr_to_indirect(node); 289 node = ptr_to_indirect(node);
280 rcu_assign_pointer(root->rnode, node); 290 rcu_assign_pointer(root->rnode, node);
281 root->height = newheight; 291 root->height = newheight;
282 } while (height > root->height); 292 } while (height > root->height);
@@ -309,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root,
309 return error; 319 return error;
310 } 320 }
311 321
312 slot = radix_tree_indirect_to_ptr(root->rnode); 322 slot = indirect_to_ptr(root->rnode);
313 323
314 height = root->height; 324 height = root->height;
315 shift = (height-1) * RADIX_TREE_MAP_SHIFT; 325 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
@@ -325,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root,
325 rcu_assign_pointer(node->slots[offset], slot); 335 rcu_assign_pointer(node->slots[offset], slot);
326 node->count++; 336 node->count++;
327 } else 337 } else
328 rcu_assign_pointer(root->rnode, 338 rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
329 radix_tree_ptr_to_indirect(slot));
330 } 339 }
331 340
332 /* Go a level down */ 341 /* Go a level down */
@@ -374,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
374 return NULL; 383 return NULL;
375 return is_slot ? (void *)&root->rnode : node; 384 return is_slot ? (void *)&root->rnode : node;
376 } 385 }
377 node = radix_tree_indirect_to_ptr(node); 386 node = indirect_to_ptr(node);
378 387
379 height = node->height; 388 height = node->height;
380 if (index > radix_tree_maxindex(height)) 389 if (index > radix_tree_maxindex(height))
@@ -393,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
393 height--; 402 height--;
394 } while (height > 0); 403 } while (height > 0);
395 404
396 return is_slot ? (void *)slot:node; 405 return is_slot ? (void *)slot : indirect_to_ptr(node);
397} 406}
398 407
399/** 408/**
@@ -455,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
455 height = root->height; 464 height = root->height;
456 BUG_ON(index > radix_tree_maxindex(height)); 465 BUG_ON(index > radix_tree_maxindex(height));
457 466
458 slot = radix_tree_indirect_to_ptr(root->rnode); 467 slot = indirect_to_ptr(root->rnode);
459 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 468 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
460 469
461 while (height > 0) { 470 while (height > 0) {
@@ -509,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
509 518
510 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 519 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
511 pathp->node = NULL; 520 pathp->node = NULL;
512 slot = radix_tree_indirect_to_ptr(root->rnode); 521 slot = indirect_to_ptr(root->rnode);
513 522
514 while (height > 0) { 523 while (height > 0) {
515 int offset; 524 int offset;
@@ -579,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
579 588
580 if (!radix_tree_is_indirect_ptr(node)) 589 if (!radix_tree_is_indirect_ptr(node))
581 return (index == 0); 590 return (index == 0);
582 node = radix_tree_indirect_to_ptr(node); 591 node = indirect_to_ptr(node);
583 592
584 height = node->height; 593 height = node->height;
585 if (index > radix_tree_maxindex(height)) 594 if (index > radix_tree_maxindex(height))
@@ -666,7 +675,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
666 } 675 }
667 676
668 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 677 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
669 slot = radix_tree_indirect_to_ptr(root->rnode); 678 slot = indirect_to_ptr(root->rnode);
670 679
671 /* 680 /*
672 * we fill the path from (root->height - 2) to 0, leaving the index at 681 * we fill the path from (root->height - 2) to 0, leaving the index at
@@ -897,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
897 results[0] = node; 906 results[0] = node;
898 return 1; 907 return 1;
899 } 908 }
900 node = radix_tree_indirect_to_ptr(node); 909 node = indirect_to_ptr(node);
901 910
902 max_index = radix_tree_maxindex(node->height); 911 max_index = radix_tree_maxindex(node->height);
903 912
@@ -916,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
916 slot = *(((void ***)results)[ret + i]); 925 slot = *(((void ***)results)[ret + i]);
917 if (!slot) 926 if (!slot)
918 continue; 927 continue;
919 results[ret + nr_found] = rcu_dereference_raw(slot); 928 results[ret + nr_found] =
929 indirect_to_ptr(rcu_dereference_raw(slot));
920 nr_found++; 930 nr_found++;
921 } 931 }
922 ret += nr_found; 932 ret += nr_found;
@@ -965,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
965 results[0] = (void **)&root->rnode; 975 results[0] = (void **)&root->rnode;
966 return 1; 976 return 1;
967 } 977 }
968 node = radix_tree_indirect_to_ptr(node); 978 node = indirect_to_ptr(node);
969 979
970 max_index = radix_tree_maxindex(node->height); 980 max_index = radix_tree_maxindex(node->height);
971 981
@@ -1090,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
1090 results[0] = node; 1100 results[0] = node;
1091 return 1; 1101 return 1;
1092 } 1102 }
1093 node = radix_tree_indirect_to_ptr(node); 1103 node = indirect_to_ptr(node);
1094 1104
1095 max_index = radix_tree_maxindex(node->height); 1105 max_index = radix_tree_maxindex(node->height);
1096 1106
@@ -1109,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
1109 slot = *(((void ***)results)[ret + i]); 1119 slot = *(((void ***)results)[ret + i]);
1110 if (!slot) 1120 if (!slot)
1111 continue; 1121 continue;
1112 results[ret + nr_found] = rcu_dereference_raw(slot); 1122 results[ret + nr_found] =
1123 indirect_to_ptr(rcu_dereference_raw(slot));
1113 nr_found++; 1124 nr_found++;
1114 } 1125 }
1115 ret += nr_found; 1126 ret += nr_found;
@@ -1159,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
1159 results[0] = (void **)&root->rnode; 1170 results[0] = (void **)&root->rnode;
1160 return 1; 1171 return 1;
1161 } 1172 }
1162 node = radix_tree_indirect_to_ptr(node); 1173 node = indirect_to_ptr(node);
1163 1174
1164 max_index = radix_tree_maxindex(node->height); 1175 max_index = radix_tree_maxindex(node->height);
1165 1176
@@ -1195,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1195 void *newptr; 1206 void *newptr;
1196 1207
1197 BUG_ON(!radix_tree_is_indirect_ptr(to_free)); 1208 BUG_ON(!radix_tree_is_indirect_ptr(to_free));
1198 to_free = radix_tree_indirect_to_ptr(to_free); 1209 to_free = indirect_to_ptr(to_free);
1199 1210
1200 /* 1211 /*
1201 * The candidate node has more than one child, or its child 1212 * The candidate node has more than one child, or its child
@@ -1208,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1208 1219
1209 /* 1220 /*
1210 * We don't need rcu_assign_pointer(), since we are simply 1221 * We don't need rcu_assign_pointer(), since we are simply
1211 * moving the node from one part of the tree to another. If 1222 * moving the node from one part of the tree to another: if it
1212 * it was safe to dereference the old pointer to it 1223 * was safe to dereference the old pointer to it
1213 * (to_free->slots[0]), it will be safe to dereference the new 1224 * (to_free->slots[0]), it will be safe to dereference the new
1214 * one (root->rnode). 1225 * one (root->rnode) as far as dependent read barriers go.
1215 */ 1226 */
1216 newptr = to_free->slots[0]; 1227 newptr = to_free->slots[0];
1217 if (root->height > 1) 1228 if (root->height > 1)
1218 newptr = radix_tree_ptr_to_indirect(newptr); 1229 newptr = ptr_to_indirect(newptr);
1219 root->rnode = newptr; 1230 root->rnode = newptr;
1220 root->height--; 1231 root->height--;
1232
1233 /*
1234 * We have a dilemma here. The node's slot[0] must not be
1235 * NULLed in case there are concurrent lookups expecting to
1236 * find the item. However if this was a bottom-level node,
1237 * then it may be subject to the slot pointer being visible
1238 * to callers dereferencing it. If item corresponding to
1239 * slot[0] is subsequently deleted, these callers would expect
1240 * their slot to become empty sooner or later.
1241 *
1242 * For example, lockless pagecache will look up a slot, deref
1243 * the page pointer, and if the page is 0 refcount it means it
1244 * was concurrently deleted from pagecache so try the deref
1245 * again. Fortunately there is already a requirement for logic
1246 * to retry the entire slot lookup -- the indirect pointer
1247 * problem (replacing direct root node with an indirect pointer
1248 * also results in a stale slot). So tag the slot as indirect
1249 * to force callers to retry.
1250 */
1251 if (root->height == 0)
1252 *((unsigned long *)&to_free->slots[0]) |=
1253 RADIX_TREE_INDIRECT_PTR;
1254
1221 radix_tree_node_free(to_free); 1255 radix_tree_node_free(to_free);
1222 } 1256 }
1223} 1257}
@@ -1254,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1254 root->rnode = NULL; 1288 root->rnode = NULL;
1255 goto out; 1289 goto out;
1256 } 1290 }
1257 slot = radix_tree_indirect_to_ptr(slot); 1291 slot = indirect_to_ptr(slot);
1258 1292
1259 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 1293 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
1260 pathp->node = NULL; 1294 pathp->node = NULL;
@@ -1296,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1296 radix_tree_node_free(to_free); 1330 radix_tree_node_free(to_free);
1297 1331
1298 if (pathp->node->count) { 1332 if (pathp->node->count) {
1299 if (pathp->node == 1333 if (pathp->node == indirect_to_ptr(root->rnode))
1300 radix_tree_indirect_to_ptr(root->rnode))
1301 radix_tree_shrink(root); 1334 radix_tree_shrink(root);
1302 goto out; 1335 goto out;
1303 } 1336 }