aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Piggin <npiggin@suse.de>2006-12-06 23:33:44 -0500
committerLinus Torvalds <torvalds@woody.osdl.org>2006-12-07 11:39:25 -0500
commit7cf9c2c76c1a17b32f2da85b50cd4fe468ed44b5 (patch)
treec3ed3e92e21f19ef744c9ea6829f4ff8d06e9f14
parent36de6437866bbb1d37e2312ff4f95ee4ed6d2b61 (diff)
[PATCH] radix-tree: RCU lockless readside
Make radix tree lookups safe to be performed without locks. Readers are protected against nodes being deleted by using RCU based freeing. Readers are protected against new node insertion by using memory barriers to ensure the node itself will be properly written before it is visible in the radix tree. Each radix tree node keeps a record of their height (above leaf nodes). This height does not change after insertion -- when the radix tree is extended, higher nodes are only inserted in the top. So a lookup can take the pointer to what is *now* the root node, and traverse down it even if the tree is concurrently extended and this node becomes a subtree of a new root. "Direct" pointers (tree height of 0, where root->rnode points directly to the data item) are handled by using the low bit of the pointer to signal whether rnode is a direct pointer or a pointer to a radix tree node. When a reader wants to traverse the next branch, they will take a copy of the pointer. This pointer will be either NULL (and the branch is empty) or non-NULL (and will point to a valid node). [akpm@osdl.org: cleanups] [Lee.Schermerhorn@hp.com: bugfixes, comments, simplifications] [clameter@sgi.com: build fix] Signed-off-by: Nick Piggin <npiggin@suse.de> Cc: "Paul E. McKenney" <paulmck@us.ibm.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--include/linux/radix-tree.h101
-rw-r--r--lib/radix-tree.c327
-rw-r--r--mm/migrate.c19
3 files changed, 340 insertions, 107 deletions
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index cbfa11537421..0deb842541ac 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -1,6 +1,7 @@
1/* 1/*
2 * Copyright (C) 2001 Momchil Velikov 2 * Copyright (C) 2001 Momchil Velikov
3 * Portions Copyright (C) 2001 Christoph Hellwig 3 * Portions Copyright (C) 2001 Christoph Hellwig
4 * Copyright (C) 2006 Nick Piggin
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 7 * modify it under the terms of the GNU General Public License as
@@ -21,6 +22,35 @@
21 22
22#include <linux/preempt.h> 23#include <linux/preempt.h>
23#include <linux/types.h> 24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/rcupdate.h>
27
28/*
29 * A direct pointer (root->rnode pointing directly to a data item,
30 * rather than another radix_tree_node) is signalled by the low bit
31 * set in the root->rnode pointer.
32 *
33 * In this case root->height is also NULL, but the direct pointer tests are
34 * needed for RCU lookups when root->height is unreliable.
35 */
36#define RADIX_TREE_DIRECT_PTR 1
37
38static inline void *radix_tree_ptr_to_direct(void *ptr)
39{
40 return (void *)((unsigned long)ptr | RADIX_TREE_DIRECT_PTR);
41}
42
43static inline void *radix_tree_direct_to_ptr(void *ptr)
44{
45 return (void *)((unsigned long)ptr & ~RADIX_TREE_DIRECT_PTR);
46}
47
48static inline int radix_tree_is_direct_ptr(void *ptr)
49{
50 return (int)((unsigned long)ptr & RADIX_TREE_DIRECT_PTR);
51}
52
53/*** radix-tree API starts here ***/
24 54
25#define RADIX_TREE_MAX_TAGS 2 55#define RADIX_TREE_MAX_TAGS 2
26 56
@@ -47,6 +77,77 @@ do { \
47 (root)->rnode = NULL; \ 77 (root)->rnode = NULL; \
48} while (0) 78} while (0)
49 79
80/**
81 * Radix-tree synchronization
82 *
83 * The radix-tree API requires that users provide all synchronisation (with
84 * specific exceptions, noted below).
85 *
86 * Synchronization of access to the data items being stored in the tree, and
87 * management of their lifetimes must be completely managed by API users.
88 *
89 * For API usage, in general,
90 * - any function _modifying_ the the tree or tags (inserting or deleting
91 * items, setting or clearing tags must exclude other modifications, and
92 * exclude any functions reading the tree.
93 * - any function _reading_ the the tree or tags (looking up items or tags,
94 * gang lookups) must exclude modifications to the tree, but may occur
95 * concurrently with other readers.
96 *
97 * The notable exceptions to this rule are the following functions:
98 * radix_tree_lookup
99 * radix_tree_tag_get
100 * radix_tree_gang_lookup
101 * radix_tree_gang_lookup_tag
102 * radix_tree_tagged
103 *
104 * The first 4 functions are able to be called locklessly, using RCU. The
105 * caller must ensure calls to these functions are made within rcu_read_lock()
106 * regions. Other readers (lock-free or otherwise) and modifications may be
107 * running concurrently.
108 *
109 * It is still required that the caller manage the synchronization and lifetimes
110 * of the items. So if RCU lock-free lookups are used, typically this would mean
111 * that the items have their own locks, or are amenable to lock-free access; and
112 * that the items are freed by RCU (or only freed after having been deleted from
113 * the radix tree *and* a synchronize_rcu() grace period).
114 *
115 * (Note, rcu_assign_pointer and rcu_dereference are not needed to control
116 * access to data items when inserting into or looking up from the radix tree)
117 *
118 * radix_tree_tagged is able to be called without locking or RCU.
119 */
120
121/**
122 * radix_tree_deref_slot - dereference a slot
123 * @pslot: pointer to slot, returned by radix_tree_lookup_slot
124 * Returns: item that was stored in that slot with any direct pointer flag
125 * removed.
126 *
127 * For use with radix_tree_lookup_slot(). Caller must hold tree at least read
128 * locked across slot lookup and dereference. More likely, will be used with
129 * radix_tree_replace_slot(), as well, so caller will hold tree write locked.
130 */
131static inline void *radix_tree_deref_slot(void **pslot)
132{
133 return radix_tree_direct_to_ptr(*pslot);
134}
135/**
136 * radix_tree_replace_slot - replace item in a slot
137 * @pslot: pointer to slot, returned by radix_tree_lookup_slot
138 * @item: new item to store in the slot.
139 *
140 * For use with radix_tree_lookup_slot(). Caller must hold tree write locked
141 * across slot lookup and replacement.
142 */
143static inline void radix_tree_replace_slot(void **pslot, void *item)
144{
145 BUG_ON(radix_tree_is_direct_ptr(item));
146 rcu_assign_pointer(*pslot,
147 (void *)((unsigned long)item |
148 ((unsigned long)*pslot & RADIX_TREE_DIRECT_PTR)));
149}
150
50int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); 151int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
51void *radix_tree_lookup(struct radix_tree_root *, unsigned long); 152void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
52void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); 153void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 9eb25955019f..e2cefabb5aa0 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -2,6 +2,7 @@
2 * Copyright (C) 2001 Momchil Velikov 2 * Copyright (C) 2001 Momchil Velikov
3 * Portions Copyright (C) 2001 Christoph Hellwig 3 * Portions Copyright (C) 2001 Christoph Hellwig
4 * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com> 4 * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
5 * Copyright (C) 2006 Nick Piggin
5 * 6 *
6 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as 8 * modify it under the terms of the GNU General Public License as
@@ -30,6 +31,7 @@
30#include <linux/gfp.h> 31#include <linux/gfp.h>
31#include <linux/string.h> 32#include <linux/string.h>
32#include <linux/bitops.h> 33#include <linux/bitops.h>
34#include <linux/rcupdate.h>
33 35
34 36
35#ifdef __KERNEL__ 37#ifdef __KERNEL__
@@ -45,7 +47,9 @@
45 ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) 47 ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
46 48
47struct radix_tree_node { 49struct radix_tree_node {
50 unsigned int height; /* Height from the bottom */
48 unsigned int count; 51 unsigned int count;
52 struct rcu_head rcu_head;
49 void *slots[RADIX_TREE_MAP_SIZE]; 53 void *slots[RADIX_TREE_MAP_SIZE];
50 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; 54 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
51}; 55};
@@ -100,13 +104,21 @@ radix_tree_node_alloc(struct radix_tree_root *root)
100 rtp->nr--; 104 rtp->nr--;
101 } 105 }
102 } 106 }
107 BUG_ON(radix_tree_is_direct_ptr(ret));
103 return ret; 108 return ret;
104} 109}
105 110
111static void radix_tree_node_rcu_free(struct rcu_head *head)
112{
113 struct radix_tree_node *node =
114 container_of(head, struct radix_tree_node, rcu_head);
115 kmem_cache_free(radix_tree_node_cachep, node);
116}
117
106static inline void 118static inline void
107radix_tree_node_free(struct radix_tree_node *node) 119radix_tree_node_free(struct radix_tree_node *node)
108{ 120{
109 kmem_cache_free(radix_tree_node_cachep, node); 121 call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
110} 122}
111 123
112/* 124/*
@@ -222,11 +234,12 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
222 } 234 }
223 235
224 do { 236 do {
237 unsigned int newheight;
225 if (!(node = radix_tree_node_alloc(root))) 238 if (!(node = radix_tree_node_alloc(root)))
226 return -ENOMEM; 239 return -ENOMEM;
227 240
228 /* Increase the height. */ 241 /* Increase the height. */
229 node->slots[0] = root->rnode; 242 node->slots[0] = radix_tree_direct_to_ptr(root->rnode);
230 243
231 /* Propagate the aggregated tag info into the new root */ 244 /* Propagate the aggregated tag info into the new root */
232 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { 245 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
@@ -234,9 +247,11 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
234 tag_set(node, tag, 0); 247 tag_set(node, tag, 0);
235 } 248 }
236 249
250 newheight = root->height+1;
251 node->height = newheight;
237 node->count = 1; 252 node->count = 1;
238 root->rnode = node; 253 rcu_assign_pointer(root->rnode, node);
239 root->height++; 254 root->height = newheight;
240 } while (height > root->height); 255 } while (height > root->height);
241out: 256out:
242 return 0; 257 return 0;
@@ -258,6 +273,8 @@ int radix_tree_insert(struct radix_tree_root *root,
258 int offset; 273 int offset;
259 int error; 274 int error;
260 275
276 BUG_ON(radix_tree_is_direct_ptr(item));
277
261 /* Make sure the tree is high enough. */ 278 /* Make sure the tree is high enough. */
262 if (index > radix_tree_maxindex(root->height)) { 279 if (index > radix_tree_maxindex(root->height)) {
263 error = radix_tree_extend(root, index); 280 error = radix_tree_extend(root, index);
@@ -275,11 +292,12 @@ int radix_tree_insert(struct radix_tree_root *root,
275 /* Have to add a child node. */ 292 /* Have to add a child node. */
276 if (!(slot = radix_tree_node_alloc(root))) 293 if (!(slot = radix_tree_node_alloc(root)))
277 return -ENOMEM; 294 return -ENOMEM;
295 slot->height = height;
278 if (node) { 296 if (node) {
279 node->slots[offset] = slot; 297 rcu_assign_pointer(node->slots[offset], slot);
280 node->count++; 298 node->count++;
281 } else 299 } else
282 root->rnode = slot; 300 rcu_assign_pointer(root->rnode, slot);
283 } 301 }
284 302
285 /* Go a level down */ 303 /* Go a level down */
@@ -295,11 +313,11 @@ int radix_tree_insert(struct radix_tree_root *root,
295 313
296 if (node) { 314 if (node) {
297 node->count++; 315 node->count++;
298 node->slots[offset] = item; 316 rcu_assign_pointer(node->slots[offset], item);
299 BUG_ON(tag_get(node, 0, offset)); 317 BUG_ON(tag_get(node, 0, offset));
300 BUG_ON(tag_get(node, 1, offset)); 318 BUG_ON(tag_get(node, 1, offset));
301 } else { 319 } else {
302 root->rnode = item; 320 rcu_assign_pointer(root->rnode, radix_tree_ptr_to_direct(item));
303 BUG_ON(root_tag_get(root, 0)); 321 BUG_ON(root_tag_get(root, 0));
304 BUG_ON(root_tag_get(root, 1)); 322 BUG_ON(root_tag_get(root, 1));
305 } 323 }
@@ -308,49 +326,54 @@ int radix_tree_insert(struct radix_tree_root *root,
308} 326}
309EXPORT_SYMBOL(radix_tree_insert); 327EXPORT_SYMBOL(radix_tree_insert);
310 328
311static inline void **__lookup_slot(struct radix_tree_root *root, 329/**
312 unsigned long index) 330 * radix_tree_lookup_slot - lookup a slot in a radix tree
331 * @root: radix tree root
332 * @index: index key
333 *
334 * Returns: the slot corresponding to the position @index in the
335 * radix tree @root. This is useful for update-if-exists operations.
336 *
337 * This function cannot be called under rcu_read_lock, it must be
338 * excluded from writers, as must the returned slot for subsequent
339 * use by radix_tree_deref_slot() and radix_tree_replace slot.
340 * Caller must hold tree write locked across slot lookup and
341 * replace.
342 */
343void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
313{ 344{
314 unsigned int height, shift; 345 unsigned int height, shift;
315 struct radix_tree_node **slot; 346 struct radix_tree_node *node, **slot;
316
317 height = root->height;
318 347
319 if (index > radix_tree_maxindex(height)) 348 node = root->rnode;
349 if (node == NULL)
320 return NULL; 350 return NULL;
321 351
322 if (height == 0 && root->rnode) 352 if (radix_tree_is_direct_ptr(node)) {
353 if (index > 0)
354 return NULL;
323 return (void **)&root->rnode; 355 return (void **)&root->rnode;
356 }
357
358 height = node->height;
359 if (index > radix_tree_maxindex(height))
360 return NULL;
324 361
325 shift = (height-1) * RADIX_TREE_MAP_SHIFT; 362 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
326 slot = &root->rnode;
327 363
328 while (height > 0) { 364 do {
329 if (*slot == NULL) 365 slot = (struct radix_tree_node **)
366 (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
367 node = *slot;
368 if (node == NULL)
330 return NULL; 369 return NULL;
331 370
332 slot = (struct radix_tree_node **)
333 ((*slot)->slots +
334 ((index >> shift) & RADIX_TREE_MAP_MASK));
335 shift -= RADIX_TREE_MAP_SHIFT; 371 shift -= RADIX_TREE_MAP_SHIFT;
336 height--; 372 height--;
337 } 373 } while (height > 0);
338 374
339 return (void **)slot; 375 return (void **)slot;
340} 376}
341
342/**
343 * radix_tree_lookup_slot - lookup a slot in a radix tree
344 * @root: radix tree root
345 * @index: index key
346 *
347 * Lookup the slot corresponding to the position @index in the radix tree
348 * @root. This is useful for update-if-exists operations.
349 */
350void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
351{
352 return __lookup_slot(root, index);
353}
354EXPORT_SYMBOL(radix_tree_lookup_slot); 377EXPORT_SYMBOL(radix_tree_lookup_slot);
355 378
356/** 379/**
@@ -359,13 +382,45 @@ EXPORT_SYMBOL(radix_tree_lookup_slot);
359 * @index: index key 382 * @index: index key
360 * 383 *
361 * Lookup the item at the position @index in the radix tree @root. 384 * Lookup the item at the position @index in the radix tree @root.
385 *
386 * This function can be called under rcu_read_lock, however the caller
387 * must manage lifetimes of leaf nodes (eg. RCU may also be used to free
388 * them safely). No RCU barriers are required to access or modify the
389 * returned item, however.
362 */ 390 */
363void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) 391void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
364{ 392{
365 void **slot; 393 unsigned int height, shift;
394 struct radix_tree_node *node, **slot;
395
396 node = rcu_dereference(root->rnode);
397 if (node == NULL)
398 return NULL;
399
400 if (radix_tree_is_direct_ptr(node)) {
401 if (index > 0)
402 return NULL;
403 return radix_tree_direct_to_ptr(node);
404 }
405
406 height = node->height;
407 if (index > radix_tree_maxindex(height))
408 return NULL;
409
410 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
366 411
367 slot = __lookup_slot(root, index); 412 do {
368 return slot != NULL ? *slot : NULL; 413 slot = (struct radix_tree_node **)
414 (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
415 node = rcu_dereference(*slot);
416 if (node == NULL)
417 return NULL;
418
419 shift -= RADIX_TREE_MAP_SHIFT;
420 height--;
421 } while (height > 0);
422
423 return node;
369} 424}
370EXPORT_SYMBOL(radix_tree_lookup); 425EXPORT_SYMBOL(radix_tree_lookup);
371 426
@@ -495,27 +550,30 @@ int radix_tree_tag_get(struct radix_tree_root *root,
495 unsigned long index, unsigned int tag) 550 unsigned long index, unsigned int tag)
496{ 551{
497 unsigned int height, shift; 552 unsigned int height, shift;
498 struct radix_tree_node *slot; 553 struct radix_tree_node *node;
499 int saw_unset_tag = 0; 554 int saw_unset_tag = 0;
500 555
501 height = root->height;
502 if (index > radix_tree_maxindex(height))
503 return 0;
504
505 /* check the root's tag bit */ 556 /* check the root's tag bit */
506 if (!root_tag_get(root, tag)) 557 if (!root_tag_get(root, tag))
507 return 0; 558 return 0;
508 559
509 if (height == 0) 560 node = rcu_dereference(root->rnode);
510 return 1; 561 if (node == NULL)
562 return 0;
563
564 if (radix_tree_is_direct_ptr(node))
565 return (index == 0);
566
567 height = node->height;
568 if (index > radix_tree_maxindex(height))
569 return 0;
511 570
512 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 571 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
513 slot = root->rnode;
514 572
515 for ( ; ; ) { 573 for ( ; ; ) {
516 int offset; 574 int offset;
517 575
518 if (slot == NULL) 576 if (node == NULL)
519 return 0; 577 return 0;
520 578
521 offset = (index >> shift) & RADIX_TREE_MAP_MASK; 579 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
@@ -524,15 +582,15 @@ int radix_tree_tag_get(struct radix_tree_root *root,
524 * This is just a debug check. Later, we can bale as soon as 582 * This is just a debug check. Later, we can bale as soon as
525 * we see an unset tag. 583 * we see an unset tag.
526 */ 584 */
527 if (!tag_get(slot, tag, offset)) 585 if (!tag_get(node, tag, offset))
528 saw_unset_tag = 1; 586 saw_unset_tag = 1;
529 if (height == 1) { 587 if (height == 1) {
530 int ret = tag_get(slot, tag, offset); 588 int ret = tag_get(node, tag, offset);
531 589
532 BUG_ON(ret && saw_unset_tag); 590 BUG_ON(ret && saw_unset_tag);
533 return !!ret; 591 return !!ret;
534 } 592 }
535 slot = slot->slots[offset]; 593 node = rcu_dereference(node->slots[offset]);
536 shift -= RADIX_TREE_MAP_SHIFT; 594 shift -= RADIX_TREE_MAP_SHIFT;
537 height--; 595 height--;
538 } 596 }
@@ -541,47 +599,45 @@ EXPORT_SYMBOL(radix_tree_tag_get);
541#endif 599#endif
542 600
543static unsigned int 601static unsigned int
544__lookup(struct radix_tree_root *root, void **results, unsigned long index, 602__lookup(struct radix_tree_node *slot, void **results, unsigned long index,
545 unsigned int max_items, unsigned long *next_index) 603 unsigned int max_items, unsigned long *next_index)
546{ 604{
547 unsigned int nr_found = 0; 605 unsigned int nr_found = 0;
548 unsigned int shift, height; 606 unsigned int shift, height;
549 struct radix_tree_node *slot;
550 unsigned long i; 607 unsigned long i;
551 608
552 height = root->height; 609 height = slot->height;
553 if (height == 0) { 610 if (height == 0)
554 if (root->rnode && index == 0)
555 results[nr_found++] = root->rnode;
556 goto out; 611 goto out;
557 }
558
559 shift = (height-1) * RADIX_TREE_MAP_SHIFT; 612 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
560 slot = root->rnode;
561 613
562 for ( ; height > 1; height--) { 614 for ( ; height > 1; height--) {
563 615 i = (index >> shift) & RADIX_TREE_MAP_MASK;
564 for (i = (index >> shift) & RADIX_TREE_MAP_MASK ; 616 for (;;) {
565 i < RADIX_TREE_MAP_SIZE; i++) {
566 if (slot->slots[i] != NULL) 617 if (slot->slots[i] != NULL)
567 break; 618 break;
568 index &= ~((1UL << shift) - 1); 619 index &= ~((1UL << shift) - 1);
569 index += 1UL << shift; 620 index += 1UL << shift;
570 if (index == 0) 621 if (index == 0)
571 goto out; /* 32-bit wraparound */ 622 goto out; /* 32-bit wraparound */
623 i++;
624 if (i == RADIX_TREE_MAP_SIZE)
625 goto out;
572 } 626 }
573 if (i == RADIX_TREE_MAP_SIZE)
574 goto out;
575 627
576 shift -= RADIX_TREE_MAP_SHIFT; 628 shift -= RADIX_TREE_MAP_SHIFT;
577 slot = slot->slots[i]; 629 slot = rcu_dereference(slot->slots[i]);
630 if (slot == NULL)
631 goto out;
578 } 632 }
579 633
580 /* Bottom level: grab some items */ 634 /* Bottom level: grab some items */
581 for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { 635 for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
636 struct radix_tree_node *node;
582 index++; 637 index++;
583 if (slot->slots[i]) { 638 node = slot->slots[i];
584 results[nr_found++] = slot->slots[i]; 639 if (node) {
640 results[nr_found++] = rcu_dereference(node);
585 if (nr_found == max_items) 641 if (nr_found == max_items)
586 goto out; 642 goto out;
587 } 643 }
@@ -603,28 +659,51 @@ out:
603 * *@results. 659 * *@results.
604 * 660 *
605 * The implementation is naive. 661 * The implementation is naive.
662 *
663 * Like radix_tree_lookup, radix_tree_gang_lookup may be called under
664 * rcu_read_lock. In this case, rather than the returned results being
665 * an atomic snapshot of the tree at a single point in time, the semantics
666 * of an RCU protected gang lookup are as though multiple radix_tree_lookups
667 * have been issued in individual locks, and results stored in 'results'.
606 */ 668 */
607unsigned int 669unsigned int
608radix_tree_gang_lookup(struct radix_tree_root *root, void **results, 670radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
609 unsigned long first_index, unsigned int max_items) 671 unsigned long first_index, unsigned int max_items)
610{ 672{
611 const unsigned long max_index = radix_tree_maxindex(root->height); 673 unsigned long max_index;
674 struct radix_tree_node *node;
612 unsigned long cur_index = first_index; 675 unsigned long cur_index = first_index;
613 unsigned int ret = 0; 676 unsigned int ret;
677
678 node = rcu_dereference(root->rnode);
679 if (!node)
680 return 0;
614 681
682 if (radix_tree_is_direct_ptr(node)) {
683 if (first_index > 0)
684 return 0;
685 node = radix_tree_direct_to_ptr(node);
686 results[0] = rcu_dereference(node);
687 return 1;
688 }
689
690 max_index = radix_tree_maxindex(node->height);
691
692 ret = 0;
615 while (ret < max_items) { 693 while (ret < max_items) {
616 unsigned int nr_found; 694 unsigned int nr_found;
617 unsigned long next_index; /* Index of next search */ 695 unsigned long next_index; /* Index of next search */
618 696
619 if (cur_index > max_index) 697 if (cur_index > max_index)
620 break; 698 break;
621 nr_found = __lookup(root, results + ret, cur_index, 699 nr_found = __lookup(node, results + ret, cur_index,
622 max_items - ret, &next_index); 700 max_items - ret, &next_index);
623 ret += nr_found; 701 ret += nr_found;
624 if (next_index == 0) 702 if (next_index == 0)
625 break; 703 break;
626 cur_index = next_index; 704 cur_index = next_index;
627 } 705 }
706
628 return ret; 707 return ret;
629} 708}
630EXPORT_SYMBOL(radix_tree_gang_lookup); 709EXPORT_SYMBOL(radix_tree_gang_lookup);
@@ -634,55 +713,64 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
634 * open-coding the search. 713 * open-coding the search.
635 */ 714 */
636static unsigned int 715static unsigned int
637__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, 716__lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
638 unsigned int max_items, unsigned long *next_index, unsigned int tag) 717 unsigned int max_items, unsigned long *next_index, unsigned int tag)
639{ 718{
640 unsigned int nr_found = 0; 719 unsigned int nr_found = 0;
641 unsigned int shift; 720 unsigned int shift, height;
642 unsigned int height = root->height;
643 struct radix_tree_node *slot;
644 721
645 if (height == 0) { 722 height = slot->height;
646 if (root->rnode && index == 0) 723 if (height == 0)
647 results[nr_found++] = root->rnode;
648 goto out; 724 goto out;
649 } 725 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
650
651 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
652 slot = root->rnode;
653 726
654 do { 727 while (height > 0) {
655 unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; 728 unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
656 729
657 for ( ; i < RADIX_TREE_MAP_SIZE; i++) { 730 for (;;) {
658 if (tag_get(slot, tag, i)) { 731 if (tag_get(slot, tag, i))
659 BUG_ON(slot->slots[i] == NULL);
660 break; 732 break;
661 }
662 index &= ~((1UL << shift) - 1); 733 index &= ~((1UL << shift) - 1);
663 index += 1UL << shift; 734 index += 1UL << shift;
664 if (index == 0) 735 if (index == 0)
665 goto out; /* 32-bit wraparound */ 736 goto out; /* 32-bit wraparound */
737 i++;
738 if (i == RADIX_TREE_MAP_SIZE)
739 goto out;
666 } 740 }
667 if (i == RADIX_TREE_MAP_SIZE)
668 goto out;
669 height--; 741 height--;
670 if (height == 0) { /* Bottom level: grab some items */ 742 if (height == 0) { /* Bottom level: grab some items */
671 unsigned long j = index & RADIX_TREE_MAP_MASK; 743 unsigned long j = index & RADIX_TREE_MAP_MASK;
672 744
673 for ( ; j < RADIX_TREE_MAP_SIZE; j++) { 745 for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
746 struct radix_tree_node *node;
674 index++; 747 index++;
675 if (tag_get(slot, tag, j)) { 748 if (!tag_get(slot, tag, j))
676 BUG_ON(slot->slots[j] == NULL); 749 continue;
677 results[nr_found++] = slot->slots[j]; 750 node = slot->slots[j];
751 /*
752 * Even though the tag was found set, we need to
753 * recheck that we have a non-NULL node, because
754 * if this lookup is lockless, it may have been
755 * subsequently deleted.
756 *
757 * Similar care must be taken in any place that
758 * lookup ->slots[x] without a lock (ie. can't
759 * rely on its value remaining the same).
760 */
761 if (node) {
762 node = rcu_dereference(node);
763 results[nr_found++] = node;
678 if (nr_found == max_items) 764 if (nr_found == max_items)
679 goto out; 765 goto out;
680 } 766 }
681 } 767 }
682 } 768 }
683 shift -= RADIX_TREE_MAP_SHIFT; 769 shift -= RADIX_TREE_MAP_SHIFT;
684 slot = slot->slots[i]; 770 slot = rcu_dereference(slot->slots[i]);
685 } while (height > 0); 771 if (slot == NULL)
772 break;
773 }
686out: 774out:
687 *next_index = index; 775 *next_index = index;
688 return nr_found; 776 return nr_found;
@@ -706,27 +794,44 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
706 unsigned long first_index, unsigned int max_items, 794 unsigned long first_index, unsigned int max_items,
707 unsigned int tag) 795 unsigned int tag)
708{ 796{
709 const unsigned long max_index = radix_tree_maxindex(root->height); 797 struct radix_tree_node *node;
798 unsigned long max_index;
710 unsigned long cur_index = first_index; 799 unsigned long cur_index = first_index;
711 unsigned int ret = 0; 800 unsigned int ret;
712 801
713 /* check the root's tag bit */ 802 /* check the root's tag bit */
714 if (!root_tag_get(root, tag)) 803 if (!root_tag_get(root, tag))
715 return 0; 804 return 0;
716 805
806 node = rcu_dereference(root->rnode);
807 if (!node)
808 return 0;
809
810 if (radix_tree_is_direct_ptr(node)) {
811 if (first_index > 0)
812 return 0;
813 node = radix_tree_direct_to_ptr(node);
814 results[0] = rcu_dereference(node);
815 return 1;
816 }
817
818 max_index = radix_tree_maxindex(node->height);
819
820 ret = 0;
717 while (ret < max_items) { 821 while (ret < max_items) {
718 unsigned int nr_found; 822 unsigned int nr_found;
719 unsigned long next_index; /* Index of next search */ 823 unsigned long next_index; /* Index of next search */
720 824
721 if (cur_index > max_index) 825 if (cur_index > max_index)
722 break; 826 break;
723 nr_found = __lookup_tag(root, results + ret, cur_index, 827 nr_found = __lookup_tag(node, results + ret, cur_index,
724 max_items - ret, &next_index, tag); 828 max_items - ret, &next_index, tag);
725 ret += nr_found; 829 ret += nr_found;
726 if (next_index == 0) 830 if (next_index == 0)
727 break; 831 break;
728 cur_index = next_index; 832 cur_index = next_index;
729 } 833 }
834
730 return ret; 835 return ret;
731} 836}
732EXPORT_SYMBOL(radix_tree_gang_lookup_tag); 837EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
@@ -742,8 +847,19 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
742 root->rnode->count == 1 && 847 root->rnode->count == 1 &&
743 root->rnode->slots[0]) { 848 root->rnode->slots[0]) {
744 struct radix_tree_node *to_free = root->rnode; 849 struct radix_tree_node *to_free = root->rnode;
850 void *newptr;
745 851
746 root->rnode = to_free->slots[0]; 852 /*
853 * We don't need rcu_assign_pointer(), since we are simply
854 * moving the node from one part of the tree to another. If
855 * it was safe to dereference the old pointer to it
856 * (to_free->slots[0]), it will be safe to dereference the new
857 * one (root->rnode).
858 */
859 newptr = to_free->slots[0];
860 if (root->height == 1)
861 newptr = radix_tree_ptr_to_direct(newptr);
862 root->rnode = newptr;
747 root->height--; 863 root->height--;
748 /* must only free zeroed nodes into the slab */ 864 /* must only free zeroed nodes into the slab */
749 tag_clear(to_free, 0, 0); 865 tag_clear(to_free, 0, 0);
@@ -767,6 +883,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
767{ 883{
768 struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; 884 struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
769 struct radix_tree_node *slot = NULL; 885 struct radix_tree_node *slot = NULL;
886 struct radix_tree_node *to_free;
770 unsigned int height, shift; 887 unsigned int height, shift;
771 int tag; 888 int tag;
772 int offset; 889 int offset;
@@ -777,6 +894,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
777 894
778 slot = root->rnode; 895 slot = root->rnode;
779 if (height == 0 && root->rnode) { 896 if (height == 0 && root->rnode) {
897 slot = radix_tree_direct_to_ptr(slot);
780 root_tag_clear_all(root); 898 root_tag_clear_all(root);
781 root->rnode = NULL; 899 root->rnode = NULL;
782 goto out; 900 goto out;
@@ -809,10 +927,17 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
809 radix_tree_tag_clear(root, index, tag); 927 radix_tree_tag_clear(root, index, tag);
810 } 928 }
811 929
930 to_free = NULL;
812 /* Now free the nodes we do not need anymore */ 931 /* Now free the nodes we do not need anymore */
813 while (pathp->node) { 932 while (pathp->node) {
814 pathp->node->slots[pathp->offset] = NULL; 933 pathp->node->slots[pathp->offset] = NULL;
815 pathp->node->count--; 934 pathp->node->count--;
935 /*
936 * Queue the node for deferred freeing after the
937 * last reference to it disappears (set NULL, above).
938 */
939 if (to_free)
940 radix_tree_node_free(to_free);
816 941
817 if (pathp->node->count) { 942 if (pathp->node->count) {
818 if (pathp->node == root->rnode) 943 if (pathp->node == root->rnode)
@@ -821,13 +946,15 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
821 } 946 }
822 947
823 /* Node with zero slots in use so free it */ 948 /* Node with zero slots in use so free it */
824 radix_tree_node_free(pathp->node); 949 to_free = pathp->node;
825
826 pathp--; 950 pathp--;
951
827 } 952 }
828 root_tag_clear_all(root); 953 root_tag_clear_all(root);
829 root->height = 0; 954 root->height = 0;
830 root->rnode = NULL; 955 root->rnode = NULL;
956 if (to_free)
957 radix_tree_node_free(to_free);
831 958
832out: 959out:
833 return slot; 960 return slot;
diff --git a/mm/migrate.c b/mm/migrate.c
index b4979d423d2b..e9b161bde95b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out:
294static int migrate_page_move_mapping(struct address_space *mapping, 294static int migrate_page_move_mapping(struct address_space *mapping,
295 struct page *newpage, struct page *page) 295 struct page *newpage, struct page *page)
296{ 296{
297 struct page **radix_pointer; 297 void **pslot;
298 298
299 if (!mapping) { 299 if (!mapping) {
300 /* Anonymous page */ 300 /* Anonymous page */
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping,
305 305
306 write_lock_irq(&mapping->tree_lock); 306 write_lock_irq(&mapping->tree_lock);
307 307
308 radix_pointer = (struct page **)radix_tree_lookup_slot( 308 pslot = radix_tree_lookup_slot(&mapping->page_tree,
309 &mapping->page_tree, 309 page_index(page));
310 page_index(page));
311 310
312 if (page_count(page) != 2 + !!PagePrivate(page) || 311 if (page_count(page) != 2 + !!PagePrivate(page) ||
313 *radix_pointer != page) { 312 (struct page *)radix_tree_deref_slot(pslot) != page) {
314 write_unlock_irq(&mapping->tree_lock); 313 write_unlock_irq(&mapping->tree_lock);
315 return -EAGAIN; 314 return -EAGAIN;
316 } 315 }
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
318 /* 317 /*
319 * Now we know that no one else is looking at the page. 318 * Now we know that no one else is looking at the page.
320 */ 319 */
321 get_page(newpage); 320 get_page(newpage); /* add cache reference */
322#ifdef CONFIG_SWAP 321#ifdef CONFIG_SWAP
323 if (PageSwapCache(page)) { 322 if (PageSwapCache(page)) {
324 SetPageSwapCache(newpage); 323 SetPageSwapCache(newpage);
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
326 } 325 }
327#endif 326#endif
328 327
329 *radix_pointer = newpage; 328 radix_tree_replace_slot(pslot, newpage);
329
330 /*
331 * Drop cache reference from old page.
332 * We know this isn't the last reference.
333 */
330 __put_page(page); 334 __put_page(page);
335
331 write_unlock_irq(&mapping->tree_lock); 336 write_unlock_irq(&mapping->tree_lock);
332 337
333 return 0; 338 return 0;