aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/radix-tree.h101
-rw-r--r--lib/radix-tree.c327
-rw-r--r--mm/migrate.c19
3 files changed, 340 insertions, 107 deletions
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index cbfa11537421..0deb842541ac 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -1,6 +1,7 @@
1/* 1/*
2 * Copyright (C) 2001 Momchil Velikov 2 * Copyright (C) 2001 Momchil Velikov
3 * Portions Copyright (C) 2001 Christoph Hellwig 3 * Portions Copyright (C) 2001 Christoph Hellwig
4 * Copyright (C) 2006 Nick Piggin
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 7 * modify it under the terms of the GNU General Public License as
@@ -21,6 +22,35 @@
21 22
22#include <linux/preempt.h> 23#include <linux/preempt.h>
23#include <linux/types.h> 24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/rcupdate.h>
27
28/*
29 * A direct pointer (root->rnode pointing directly to a data item,
30 * rather than another radix_tree_node) is signalled by the low bit
31 * set in the root->rnode pointer.
32 *
33 * In this case root->height is also NULL, but the direct pointer tests are
34 * needed for RCU lookups when root->height is unreliable.
35 */
36#define RADIX_TREE_DIRECT_PTR 1
37
38static inline void *radix_tree_ptr_to_direct(void *ptr)
39{
40 return (void *)((unsigned long)ptr | RADIX_TREE_DIRECT_PTR);
41}
42
43static inline void *radix_tree_direct_to_ptr(void *ptr)
44{
45 return (void *)((unsigned long)ptr & ~RADIX_TREE_DIRECT_PTR);
46}
47
48static inline int radix_tree_is_direct_ptr(void *ptr)
49{
50 return (int)((unsigned long)ptr & RADIX_TREE_DIRECT_PTR);
51}
52
53/*** radix-tree API starts here ***/
24 54
25#define RADIX_TREE_MAX_TAGS 2 55#define RADIX_TREE_MAX_TAGS 2
26 56
@@ -47,6 +77,77 @@ do { \
47 (root)->rnode = NULL; \ 77 (root)->rnode = NULL; \
48} while (0) 78} while (0)
49 79
80/**
81 * Radix-tree synchronization
82 *
83 * The radix-tree API requires that users provide all synchronisation (with
84 * specific exceptions, noted below).
85 *
86 * Synchronization of access to the data items being stored in the tree, and
87 * management of their lifetimes must be completely managed by API users.
88 *
89 * For API usage, in general,
90 * - any function _modifying_ the the tree or tags (inserting or deleting
91 * items, setting or clearing tags must exclude other modifications, and
92 * exclude any functions reading the tree.
93 * - any function _reading_ the the tree or tags (looking up items or tags,
94 * gang lookups) must exclude modifications to the tree, but may occur
95 * concurrently with other readers.
96 *
97 * The notable exceptions to this rule are the following functions:
98 * radix_tree_lookup
99 * radix_tree_tag_get
100 * radix_tree_gang_lookup
101 * radix_tree_gang_lookup_tag
102 * radix_tree_tagged
103 *
104 * The first 4 functions are able to be called locklessly, using RCU. The
105 * caller must ensure calls to these functions are made within rcu_read_lock()
106 * regions. Other readers (lock-free or otherwise) and modifications may be
107 * running concurrently.
108 *
109 * It is still required that the caller manage the synchronization and lifetimes
110 * of the items. So if RCU lock-free lookups are used, typically this would mean
111 * that the items have their own locks, or are amenable to lock-free access; and
112 * that the items are freed by RCU (or only freed after having been deleted from
113 * the radix tree *and* a synchronize_rcu() grace period).
114 *
115 * (Note, rcu_assign_pointer and rcu_dereference are not needed to control
116 * access to data items when inserting into or looking up from the radix tree)
117 *
118 * radix_tree_tagged is able to be called without locking or RCU.
119 */
120
121/**
122 * radix_tree_deref_slot - dereference a slot
123 * @pslot: pointer to slot, returned by radix_tree_lookup_slot
124 * Returns: item that was stored in that slot with any direct pointer flag
125 * removed.
126 *
127 * For use with radix_tree_lookup_slot(). Caller must hold tree at least read
128 * locked across slot lookup and dereference. More likely, will be used with
129 * radix_tree_replace_slot(), as well, so caller will hold tree write locked.
130 */
131static inline void *radix_tree_deref_slot(void **pslot)
132{
133 return radix_tree_direct_to_ptr(*pslot);
134}
135/**
136 * radix_tree_replace_slot - replace item in a slot
137 * @pslot: pointer to slot, returned by radix_tree_lookup_slot
138 * @item: new item to store in the slot.
139 *
140 * For use with radix_tree_lookup_slot(). Caller must hold tree write locked
141 * across slot lookup and replacement.
142 */
143static inline void radix_tree_replace_slot(void **pslot, void *item)
144{
145 BUG_ON(radix_tree_is_direct_ptr(item));
146 rcu_assign_pointer(*pslot,
147 (void *)((unsigned long)item |
148 ((unsigned long)*pslot & RADIX_TREE_DIRECT_PTR)));
149}
150
50int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); 151int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
51void *radix_tree_lookup(struct radix_tree_root *, unsigned long); 152void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
52void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); 153void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 9eb25955019f..e2cefabb5aa0 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -2,6 +2,7 @@
2 * Copyright (C) 2001 Momchil Velikov 2 * Copyright (C) 2001 Momchil Velikov
3 * Portions Copyright (C) 2001 Christoph Hellwig 3 * Portions Copyright (C) 2001 Christoph Hellwig
4 * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com> 4 * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
5 * Copyright (C) 2006 Nick Piggin
5 * 6 *
6 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as 8 * modify it under the terms of the GNU General Public License as
@@ -30,6 +31,7 @@
30#include <linux/gfp.h> 31#include <linux/gfp.h>
31#include <linux/string.h> 32#include <linux/string.h>
32#include <linux/bitops.h> 33#include <linux/bitops.h>
34#include <linux/rcupdate.h>
33 35
34 36
35#ifdef __KERNEL__ 37#ifdef __KERNEL__
@@ -45,7 +47,9 @@
45 ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) 47 ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
46 48
47struct radix_tree_node { 49struct radix_tree_node {
50 unsigned int height; /* Height from the bottom */
48 unsigned int count; 51 unsigned int count;
52 struct rcu_head rcu_head;
49 void *slots[RADIX_TREE_MAP_SIZE]; 53 void *slots[RADIX_TREE_MAP_SIZE];
50 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; 54 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
51}; 55};
@@ -100,13 +104,21 @@ radix_tree_node_alloc(struct radix_tree_root *root)
100 rtp->nr--; 104 rtp->nr--;
101 } 105 }
102 } 106 }
107 BUG_ON(radix_tree_is_direct_ptr(ret));
103 return ret; 108 return ret;
104} 109}
105 110
111static void radix_tree_node_rcu_free(struct rcu_head *head)
112{
113 struct radix_tree_node *node =
114 container_of(head, struct radix_tree_node, rcu_head);
115 kmem_cache_free(radix_tree_node_cachep, node);
116}
117
106static inline void 118static inline void
107radix_tree_node_free(struct radix_tree_node *node) 119radix_tree_node_free(struct radix_tree_node *node)
108{ 120{
109 kmem_cache_free(radix_tree_node_cachep, node); 121 call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
110} 122}
111 123
112/* 124/*
@@ -222,11 +234,12 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
222 } 234 }
223 235
224 do { 236 do {
237 unsigned int newheight;
225 if (!(node = radix_tree_node_alloc(root))) 238 if (!(node = radix_tree_node_alloc(root)))
226 return -ENOMEM; 239 return -ENOMEM;
227 240
228 /* Increase the height. */ 241 /* Increase the height. */
229 node->slots[0] = root->rnode; 242 node->slots[0] = radix_tree_direct_to_ptr(root->rnode);
230 243
231 /* Propagate the aggregated tag info into the new root */ 244 /* Propagate the aggregated tag info into the new root */
232 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { 245 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
@@ -234,9 +247,11 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
234 tag_set(node, tag, 0); 247 tag_set(node, tag, 0);
235 } 248 }
236 249
250 newheight = root->height+1;
251 node->height = newheight;
237 node->count = 1; 252 node->count = 1;
238 root->rnode = node; 253 rcu_assign_pointer(root->rnode, node);
239 root->height++; 254 root->height = newheight;
240 } while (height > root->height); 255 } while (height > root->height);
241out: 256out:
242 return 0; 257 return 0;
@@ -258,6 +273,8 @@ int radix_tree_insert(struct radix_tree_root *root,
258 int offset; 273 int offset;
259 int error; 274 int error;
260 275
276 BUG_ON(radix_tree_is_direct_ptr(item));
277
261 /* Make sure the tree is high enough. */ 278 /* Make sure the tree is high enough. */
262 if (index > radix_tree_maxindex(root->height)) { 279 if (index > radix_tree_maxindex(root->height)) {
263 error = radix_tree_extend(root, index); 280 error = radix_tree_extend(root, index);
@@ -275,11 +292,12 @@ int radix_tree_insert(struct radix_tree_root *root,
275 /* Have to add a child node. */ 292 /* Have to add a child node. */
276 if (!(slot = radix_tree_node_alloc(root))) 293 if (!(slot = radix_tree_node_alloc(root)))
277 return -ENOMEM; 294 return -ENOMEM;
295 slot->height = height;
278 if (node) { 296 if (node) {
279 node->slots[offset] = slot; 297 rcu_assign_pointer(node->slots[offset], slot);
280 node->count++; 298 node->count++;
281 } else 299 } else
282 root->rnode = slot; 300 rcu_assign_pointer(root->rnode, slot);
283 } 301 }
284 302
285 /* Go a level down */ 303 /* Go a level down */
@@ -295,11 +313,11 @@ int radix_tree_insert(struct radix_tree_root *root,
295 313
296 if (node) { 314 if (node) {
297 node->count++; 315 node->count++;
298 node->slots[offset] = item; 316 rcu_assign_pointer(node->slots[offset], item);
299 BUG_ON(tag_get(node, 0, offset)); 317 BUG_ON(tag_get(node, 0, offset));
300 BUG_ON(tag_get(node, 1, offset)); 318 BUG_ON(tag_get(node, 1, offset));
301 } else { 319 } else {
302 root->rnode = item; 320 rcu_assign_pointer(root->rnode, radix_tree_ptr_to_direct(item));
303 BUG_ON(root_tag_get(root, 0)); 321 BUG_ON(root_tag_get(root, 0));
304 BUG_ON(root_tag_get(root, 1)); 322 BUG_ON(root_tag_get(root, 1));
305 } 323 }
@@ -308,49 +326,54 @@ int radix_tree_insert(struct radix_tree_root *root,
308} 326}
309EXPORT_SYMBOL(radix_tree_insert); 327EXPORT_SYMBOL(radix_tree_insert);
310 328
311static inline void **__lookup_slot(struct radix_tree_root *root, 329/**
312 unsigned long index) 330 * radix_tree_lookup_slot - lookup a slot in a radix tree
331 * @root: radix tree root
332 * @index: index key
333 *
334 * Returns: the slot corresponding to the position @index in the
335 * radix tree @root. This is useful for update-if-exists operations.
336 *
337 * This function cannot be called under rcu_read_lock, it must be
338 * excluded from writers, as must the returned slot for subsequent
339 * use by radix_tree_deref_slot() and radix_tree_replace slot.
340 * Caller must hold tree write locked across slot lookup and
341 * replace.
342 */
343void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
313{ 344{
314 unsigned int height, shift; 345 unsigned int height, shift;
315 struct radix_tree_node **slot; 346 struct radix_tree_node *node, **slot;
316
317 height = root->height;
318 347
319 if (index > radix_tree_maxindex(height)) 348 node = root->rnode;
349 if (node == NULL)
320 return NULL; 350 return NULL;
321 351
322 if (height == 0 && root->rnode) 352 if (radix_tree_is_direct_ptr(node)) {
353 if (index > 0)
354 return NULL;
323 return (void **)&root->rnode; 355 return (void **)&root->rnode;
356 }
357
358 height = node->height;
359 if (index > radix_tree_maxindex(height))
360 return NULL;
324 361
325 shift = (height-1) * RADIX_TREE_MAP_SHIFT; 362 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
326 slot = &root->rnode;
327 363
328 while (height > 0) { 364 do {
329 if (*slot == NULL) 365 slot = (struct radix_tree_node **)
366 (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
367 node = *slot;
368 if (node == NULL)
330 return NULL; 369 return NULL;
331 370
332 slot = (struct radix_tree_node **)
333 ((*slot)->slots +
334 ((index >> shift) & RADIX_TREE_MAP_MASK));
335 shift -= RADIX_TREE_MAP_SHIFT; 371 shift -= RADIX_TREE_MAP_SHIFT;
336 height--; 372 height--;
337 } 373 } while (height > 0);
338 374
339 return (void **)slot; 375 return (void **)slot;
340} 376}
341
342/**
343 * radix_tree_lookup_slot - lookup a slot in a radix tree
344 * @root: radix tree root
345 * @index: index key
346 *
347 * Lookup the slot corresponding to the position @index in the radix tree
348 * @root. This is useful for update-if-exists operations.
349 */
350void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
351{
352 return __lookup_slot(root, index);
353}
354EXPORT_SYMBOL(radix_tree_lookup_slot); 377EXPORT_SYMBOL(radix_tree_lookup_slot);
355 378
356/** 379/**
@@ -359,13 +382,45 @@ EXPORT_SYMBOL(radix_tree_lookup_slot);
359 * @index: index key 382 * @index: index key
360 * 383 *
361 * Lookup the item at the position @index in the radix tree @root. 384 * Lookup the item at the position @index in the radix tree @root.
385 *
386 * This function can be called under rcu_read_lock, however the caller
387 * must manage lifetimes of leaf nodes (eg. RCU may also be used to free
388 * them safely). No RCU barriers are required to access or modify the
389 * returned item, however.
362 */ 390 */
363void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) 391void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
364{ 392{
365 void **slot; 393 unsigned int height, shift;
394 struct radix_tree_node *node, **slot;
395
396 node = rcu_dereference(root->rnode);
397 if (node == NULL)
398 return NULL;
399
400 if (radix_tree_is_direct_ptr(node)) {
401 if (index > 0)
402 return NULL;
403 return radix_tree_direct_to_ptr(node);
404 }
405
406 height = node->height;
407 if (index > radix_tree_maxindex(height))
408 return NULL;
409
410 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
366 411
367 slot = __lookup_slot(root, index); 412 do {
368 return slot != NULL ? *slot : NULL; 413 slot = (struct radix_tree_node **)
414 (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
415 node = rcu_dereference(*slot);
416 if (node == NULL)
417 return NULL;
418
419 shift -= RADIX_TREE_MAP_SHIFT;
420 height--;
421 } while (height > 0);
422
423 return node;
369} 424}
370EXPORT_SYMBOL(radix_tree_lookup); 425EXPORT_SYMBOL(radix_tree_lookup);
371 426
@@ -495,27 +550,30 @@ int radix_tree_tag_get(struct radix_tree_root *root,
495 unsigned long index, unsigned int tag) 550 unsigned long index, unsigned int tag)
496{ 551{
497 unsigned int height, shift; 552 unsigned int height, shift;
498 struct radix_tree_node *slot; 553 struct radix_tree_node *node;
499 int saw_unset_tag = 0; 554 int saw_unset_tag = 0;
500 555
501 height = root->height;
502 if (index > radix_tree_maxindex(height))
503 return 0;
504
505 /* check the root's tag bit */ 556 /* check the root's tag bit */
506 if (!root_tag_get(root, tag)) 557 if (!root_tag_get(root, tag))
507 return 0; 558 return 0;
508 559
509 if (height == 0) 560 node = rcu_dereference(root->rnode);
510 return 1; 561 if (node == NULL)
562 return 0;
563
564 if (radix_tree_is_direct_ptr(node))
565 return (index == 0);
566
567 height = node->height;
568 if (index > radix_tree_maxindex(height))
569 return 0;
511 570
512 shift = (height - 1) * RADIX_TREE_MAP_SHIFT; 571 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
513 slot = root->rnode;
514 572
515 for ( ; ; ) { 573 for ( ; ; ) {
516 int offset; 574 int offset;
517 575
518 if (slot == NULL) 576 if (node == NULL)
519 return 0; 577 return 0;
520 578
521 offset = (index >> shift) & RADIX_TREE_MAP_MASK; 579 offset = (index >> shift) & RADIX_TREE_MAP_MASK;
@@ -524,15 +582,15 @@ int radix_tree_tag_get(struct radix_tree_root *root,
524 * This is just a debug check. Later, we can bale as soon as 582 * This is just a debug check. Later, we can bale as soon as
525 * we see an unset tag. 583 * we see an unset tag.
526 */ 584 */
527 if (!tag_get(slot, tag, offset)) 585 if (!tag_get(node, tag, offset))
528 saw_unset_tag = 1; 586 saw_unset_tag = 1;
529 if (height == 1) { 587 if (height == 1) {
530 int ret = tag_get(slot, tag, offset); 588 int ret = tag_get(node, tag, offset);
531 589
532 BUG_ON(ret && saw_unset_tag); 590 BUG_ON(ret && saw_unset_tag);
533 return !!ret; 591 return !!ret;
534 } 592 }
535 slot = slot->slots[offset]; 593 node = rcu_dereference(node->slots[offset]);
536 shift -= RADIX_TREE_MAP_SHIFT; 594 shift -= RADIX_TREE_MAP_SHIFT;
537 height--; 595 height--;
538 } 596 }
@@ -541,47 +599,45 @@ EXPORT_SYMBOL(radix_tree_tag_get);
541#endif 599#endif
542 600
543static unsigned int 601static unsigned int
544__lookup(struct radix_tree_root *root, void **results, unsigned long index, 602__lookup(struct radix_tree_node *slot, void **results, unsigned long index,
545 unsigned int max_items, unsigned long *next_index) 603 unsigned int max_items, unsigned long *next_index)
546{ 604{
547 unsigned int nr_found = 0; 605 unsigned int nr_found = 0;
548 unsigned int shift, height; 606 unsigned int shift, height;
549 struct radix_tree_node *slot;
550 unsigned long i; 607 unsigned long i;
551 608
552 height = root->height; 609 height = slot->height;
553 if (height == 0) { 610 if (height == 0)
554 if (root->rnode && index == 0)
555 results[nr_found++] = root->rnode;
556 goto out; 611 goto out;
557 }
558
559 shift = (height-1) * RADIX_TREE_MAP_SHIFT; 612 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
560 slot = root->rnode;
561 613
562 for ( ; height > 1; height--) { 614 for ( ; height > 1; height--) {
563 615 i = (index >> shift) & RADIX_TREE_MAP_MASK;
564 for (i = (index >> shift) & RADIX_TREE_MAP_MASK ; 616 for (;;) {
565 i < RADIX_TREE_MAP_SIZE; i++) {
566 if (slot->slots[i] != NULL) 617 if (slot->slots[i] != NULL)
567 break; 618 break;
568 index &= ~((1UL << shift) - 1); 619 index &= ~((1UL << shift) - 1);
569 index += 1UL << shift; 620 index += 1UL << shift;
570 if (index == 0) 621 if (index == 0)
571 goto out; /* 32-bit wraparound */ 622 goto out; /* 32-bit wraparound */
623 i++;
624 if (i == RADIX_TREE_MAP_SIZE)
625 goto out;
572 } 626 }
573 if (i == RADIX_TREE_MAP_SIZE)
574 goto out;
575 627
576 shift -= RADIX_TREE_MAP_SHIFT; 628 shift -= RADIX_TREE_MAP_SHIFT;
577 slot = slot->slots[i]; 629 slot = rcu_dereference(slot->slots[i]);
630 if (slot == NULL)
631 goto out;
578 } 632 }
579 633
580 /* Bottom level: grab some items */ 634 /* Bottom level: grab some items */
581 for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { 635 for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
636 struct radix_tree_node *node;
582 index++; 637 index++;
583 if (slot->slots[i]) { 638 node = slot->slots[i];
584 results[nr_found++] = slot->slots[i]; 639 if (node) {
640 results[nr_found++] = rcu_dereference(node);
585 if (nr_found == max_items) 641 if (nr_found == max_items)
586 goto out; 642 goto out;
587 } 643 }
@@ -603,28 +659,51 @@ out:
603 * *@results. 659 * *@results.
604 * 660 *
605 * The implementation is naive. 661 * The implementation is naive.
662 *
663 * Like radix_tree_lookup, radix_tree_gang_lookup may be called under
664 * rcu_read_lock. In this case, rather than the returned results being
665 * an atomic snapshot of the tree at a single point in time, the semantics
666 * of an RCU protected gang lookup are as though multiple radix_tree_lookups
667 * have been issued in individual locks, and results stored in 'results'.
606 */ 668 */
607unsigned int 669unsigned int
608radix_tree_gang_lookup(struct radix_tree_root *root, void **results, 670radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
609 unsigned long first_index, unsigned int max_items) 671 unsigned long first_index, unsigned int max_items)
610{ 672{
611 const unsigned long max_index = radix_tree_maxindex(root->height); 673 unsigned long max_index;
674 struct radix_tree_node *node;
612 unsigned long cur_index = first_index; 675 unsigned long cur_index = first_index;
613 unsigned int ret = 0; 676 unsigned int ret;
677
678 node = rcu_dereference(root->rnode);
679 if (!node)
680 return 0;
614 681
682 if (radix_tree_is_direct_ptr(node)) {
683 if (first_index > 0)
684 return 0;
685 node = radix_tree_direct_to_ptr(node);
686 results[0] = rcu_dereference(node);
687 return 1;
688 }
689
690 max_index = radix_tree_maxindex(node->height);
691
692 ret = 0;
615 while (ret < max_items) { 693 while (ret < max_items) {
616 unsigned int nr_found; 694 unsigned int nr_found;
617 unsigned long next_index; /* Index of next search */ 695 unsigned long next_index; /* Index of next search */
618 696
619 if (cur_index > max_index) 697 if (cur_index > max_index)
620 break; 698 break;
621 nr_found = __lookup(root, results + ret, cur_index, 699 nr_found = __lookup(node, results + ret, cur_index,
622 max_items - ret, &next_index); 700 max_items - ret, &next_index);
623 ret += nr_found; 701 ret += nr_found;
624 if (next_index == 0) 702 if (next_index == 0)
625 break; 703 break;
626 cur_index = next_index; 704 cur_index = next_index;
627 } 705 }
706
628 return ret; 707 return ret;
629} 708}
630EXPORT_SYMBOL(radix_tree_gang_lookup); 709EXPORT_SYMBOL(radix_tree_gang_lookup);
@@ -634,55 +713,64 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
634 * open-coding the search. 713 * open-coding the search.
635 */ 714 */
636static unsigned int 715static unsigned int
637__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, 716__lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index,
638 unsigned int max_items, unsigned long *next_index, unsigned int tag) 717 unsigned int max_items, unsigned long *next_index, unsigned int tag)
639{ 718{
640 unsigned int nr_found = 0; 719 unsigned int nr_found = 0;
641 unsigned int shift; 720 unsigned int shift, height;
642 unsigned int height = root->height;
643 struct radix_tree_node *slot;
644 721
645 if (height == 0) { 722 height = slot->height;
646 if (root->rnode && index == 0) 723 if (height == 0)
647 results[nr_found++] = root->rnode;
648 goto out; 724 goto out;
649 } 725 shift = (height-1) * RADIX_TREE_MAP_SHIFT;
650
651 shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
652 slot = root->rnode;
653 726
654 do { 727 while (height > 0) {
655 unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; 728 unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
656 729
657 for ( ; i < RADIX_TREE_MAP_SIZE; i++) { 730 for (;;) {
658 if (tag_get(slot, tag, i)) { 731 if (tag_get(slot, tag, i))
659 BUG_ON(slot->slots[i] == NULL);
660 break; 732 break;
661 }
662 index &= ~((1UL << shift) - 1); 733 index &= ~((1UL << shift) - 1);
663 index += 1UL << shift; 734 index += 1UL << shift;
664 if (index == 0) 735 if (index == 0)
665 goto out; /* 32-bit wraparound */ 736 goto out; /* 32-bit wraparound */
737 i++;
738 if (i == RADIX_TREE_MAP_SIZE)
739 goto out;
666 } 740 }
667 if (i == RADIX_TREE_MAP_SIZE)
668 goto out;
669 height--; 741 height--;
670 if (height == 0) { /* Bottom level: grab some items */ 742 if (height == 0) { /* Bottom level: grab some items */
671 unsigned long j = index & RADIX_TREE_MAP_MASK; 743 unsigned long j = index & RADIX_TREE_MAP_MASK;
672 744
673 for ( ; j < RADIX_TREE_MAP_SIZE; j++) { 745 for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
746 struct radix_tree_node *node;
674 index++; 747 index++;
675 if (tag_get(slot, tag, j)) { 748 if (!tag_get(slot, tag, j))
676 BUG_ON(slot->slots[j] == NULL); 749 continue;
677 results[nr_found++] = slot->slots[j]; 750 node = slot->slots[j];
751 /*
752 * Even though the tag was found set, we need to
753 * recheck that we have a non-NULL node, because
754 * if this lookup is lockless, it may have been
755 * subsequently deleted.
756 *
757 * Similar care must be taken in any place that
758 * lookup ->slots[x] without a lock (ie. can't
759 * rely on its value remaining the same).
760 */
761 if (node) {
762 node = rcu_dereference(node);
763 results[nr_found++] = node;
678 if (nr_found == max_items) 764 if (nr_found == max_items)
679 goto out; 765 goto out;
680 } 766 }
681 } 767 }
682 } 768 }
683 shift -= RADIX_TREE_MAP_SHIFT; 769 shift -= RADIX_TREE_MAP_SHIFT;
684 slot = slot->slots[i]; 770 slot = rcu_dereference(slot->slots[i]);
685 } while (height > 0); 771 if (slot == NULL)
772 break;
773 }
686out: 774out:
687 *next_index = index; 775 *next_index = index;
688 return nr_found; 776 return nr_found;
@@ -706,27 +794,44 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
706 unsigned long first_index, unsigned int max_items, 794 unsigned long first_index, unsigned int max_items,
707 unsigned int tag) 795 unsigned int tag)
708{ 796{
709 const unsigned long max_index = radix_tree_maxindex(root->height); 797 struct radix_tree_node *node;
798 unsigned long max_index;
710 unsigned long cur_index = first_index; 799 unsigned long cur_index = first_index;
711 unsigned int ret = 0; 800 unsigned int ret;
712 801
713 /* check the root's tag bit */ 802 /* check the root's tag bit */
714 if (!root_tag_get(root, tag)) 803 if (!root_tag_get(root, tag))
715 return 0; 804 return 0;
716 805
806 node = rcu_dereference(root->rnode);
807 if (!node)
808 return 0;
809
810 if (radix_tree_is_direct_ptr(node)) {
811 if (first_index > 0)
812 return 0;
813 node = radix_tree_direct_to_ptr(node);
814 results[0] = rcu_dereference(node);
815 return 1;
816 }
817
818 max_index = radix_tree_maxindex(node->height);
819
820 ret = 0;
717 while (ret < max_items) { 821 while (ret < max_items) {
718 unsigned int nr_found; 822 unsigned int nr_found;
719 unsigned long next_index; /* Index of next search */ 823 unsigned long next_index; /* Index of next search */
720 824
721 if (cur_index > max_index) 825 if (cur_index > max_index)
722 break; 826 break;
723 nr_found = __lookup_tag(root, results + ret, cur_index, 827 nr_found = __lookup_tag(node, results + ret, cur_index,
724 max_items - ret, &next_index, tag); 828 max_items - ret, &next_index, tag);
725 ret += nr_found; 829 ret += nr_found;
726 if (next_index == 0) 830 if (next_index == 0)
727 break; 831 break;
728 cur_index = next_index; 832 cur_index = next_index;
729 } 833 }
834
730 return ret; 835 return ret;
731} 836}
732EXPORT_SYMBOL(radix_tree_gang_lookup_tag); 837EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
@@ -742,8 +847,19 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
742 root->rnode->count == 1 && 847 root->rnode->count == 1 &&
743 root->rnode->slots[0]) { 848 root->rnode->slots[0]) {
744 struct radix_tree_node *to_free = root->rnode; 849 struct radix_tree_node *to_free = root->rnode;
850 void *newptr;
745 851
746 root->rnode = to_free->slots[0]; 852 /*
853 * We don't need rcu_assign_pointer(), since we are simply
854 * moving the node from one part of the tree to another. If
855 * it was safe to dereference the old pointer to it
856 * (to_free->slots[0]), it will be safe to dereference the new
857 * one (root->rnode).
858 */
859 newptr = to_free->slots[0];
860 if (root->height == 1)
861 newptr = radix_tree_ptr_to_direct(newptr);
862 root->rnode = newptr;
747 root->height--; 863 root->height--;
748 /* must only free zeroed nodes into the slab */ 864 /* must only free zeroed nodes into the slab */
749 tag_clear(to_free, 0, 0); 865 tag_clear(to_free, 0, 0);
@@ -767,6 +883,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
767{ 883{
768 struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; 884 struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
769 struct radix_tree_node *slot = NULL; 885 struct radix_tree_node *slot = NULL;
886 struct radix_tree_node *to_free;
770 unsigned int height, shift; 887 unsigned int height, shift;
771 int tag; 888 int tag;
772 int offset; 889 int offset;
@@ -777,6 +894,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
777 894
778 slot = root->rnode; 895 slot = root->rnode;
779 if (height == 0 && root->rnode) { 896 if (height == 0 && root->rnode) {
897 slot = radix_tree_direct_to_ptr(slot);
780 root_tag_clear_all(root); 898 root_tag_clear_all(root);
781 root->rnode = NULL; 899 root->rnode = NULL;
782 goto out; 900 goto out;
@@ -809,10 +927,17 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
809 radix_tree_tag_clear(root, index, tag); 927 radix_tree_tag_clear(root, index, tag);
810 } 928 }
811 929
930 to_free = NULL;
812 /* Now free the nodes we do not need anymore */ 931 /* Now free the nodes we do not need anymore */
813 while (pathp->node) { 932 while (pathp->node) {
814 pathp->node->slots[pathp->offset] = NULL; 933 pathp->node->slots[pathp->offset] = NULL;
815 pathp->node->count--; 934 pathp->node->count--;
935 /*
936 * Queue the node for deferred freeing after the
937 * last reference to it disappears (set NULL, above).
938 */
939 if (to_free)
940 radix_tree_node_free(to_free);
816 941
817 if (pathp->node->count) { 942 if (pathp->node->count) {
818 if (pathp->node == root->rnode) 943 if (pathp->node == root->rnode)
@@ -821,13 +946,15 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
821 } 946 }
822 947
823 /* Node with zero slots in use so free it */ 948 /* Node with zero slots in use so free it */
824 radix_tree_node_free(pathp->node); 949 to_free = pathp->node;
825
826 pathp--; 950 pathp--;
951
827 } 952 }
828 root_tag_clear_all(root); 953 root_tag_clear_all(root);
829 root->height = 0; 954 root->height = 0;
830 root->rnode = NULL; 955 root->rnode = NULL;
956 if (to_free)
957 radix_tree_node_free(to_free);
831 958
832out: 959out:
833 return slot; 960 return slot;
diff --git a/mm/migrate.c b/mm/migrate.c
index b4979d423d2b..e9b161bde95b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ out:
294static int migrate_page_move_mapping(struct address_space *mapping, 294static int migrate_page_move_mapping(struct address_space *mapping,
295 struct page *newpage, struct page *page) 295 struct page *newpage, struct page *page)
296{ 296{
297 struct page **radix_pointer; 297 void **pslot;
298 298
299 if (!mapping) { 299 if (!mapping) {
300 /* Anonymous page */ 300 /* Anonymous page */
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping,
305 305
306 write_lock_irq(&mapping->tree_lock); 306 write_lock_irq(&mapping->tree_lock);
307 307
308 radix_pointer = (struct page **)radix_tree_lookup_slot( 308 pslot = radix_tree_lookup_slot(&mapping->page_tree,
309 &mapping->page_tree, 309 page_index(page));
310 page_index(page));
311 310
312 if (page_count(page) != 2 + !!PagePrivate(page) || 311 if (page_count(page) != 2 + !!PagePrivate(page) ||
313 *radix_pointer != page) { 312 (struct page *)radix_tree_deref_slot(pslot) != page) {
314 write_unlock_irq(&mapping->tree_lock); 313 write_unlock_irq(&mapping->tree_lock);
315 return -EAGAIN; 314 return -EAGAIN;
316 } 315 }
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
318 /* 317 /*
319 * Now we know that no one else is looking at the page. 318 * Now we know that no one else is looking at the page.
320 */ 319 */
321 get_page(newpage); 320 get_page(newpage); /* add cache reference */
322#ifdef CONFIG_SWAP 321#ifdef CONFIG_SWAP
323 if (PageSwapCache(page)) { 322 if (PageSwapCache(page)) {
324 SetPageSwapCache(newpage); 323 SetPageSwapCache(newpage);
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
326 } 325 }
327#endif 326#endif
328 327
329 *radix_pointer = newpage; 328 radix_tree_replace_slot(pslot, newpage);
329
330 /*
331 * Drop cache reference from old page.
332 * We know this isn't the last reference.
333 */
330 __put_page(page); 334 __put_page(page);
335
331 write_unlock_irq(&mapping->tree_lock); 336 write_unlock_irq(&mapping->tree_lock);
332 337
333 return 0; 338 return 0;