diff options
-rw-r--r-- | include/linux/radix-tree.h | 101 | ||||
-rw-r--r-- | lib/radix-tree.c | 327 | ||||
-rw-r--r-- | mm/migrate.c | 19 |
3 files changed, 340 insertions, 107 deletions
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index cbfa11537421..0deb842541ac 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h | |||
@@ -1,6 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (C) 2001 Momchil Velikov | 2 | * Copyright (C) 2001 Momchil Velikov |
3 | * Portions Copyright (C) 2001 Christoph Hellwig | 3 | * Portions Copyright (C) 2001 Christoph Hellwig |
4 | * Copyright (C) 2006 Nick Piggin | ||
4 | * | 5 | * |
5 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU General Public License as | 7 | * modify it under the terms of the GNU General Public License as |
@@ -21,6 +22,35 @@ | |||
21 | 22 | ||
22 | #include <linux/preempt.h> | 23 | #include <linux/preempt.h> |
23 | #include <linux/types.h> | 24 | #include <linux/types.h> |
25 | #include <linux/kernel.h> | ||
26 | #include <linux/rcupdate.h> | ||
27 | |||
28 | /* | ||
29 | * A direct pointer (root->rnode pointing directly to a data item, | ||
30 | * rather than another radix_tree_node) is signalled by the low bit | ||
31 | * set in the root->rnode pointer. | ||
32 | * | ||
33 | * In this case root->height is also 0, but the direct pointer tests are | ||
34 | * needed for RCU lookups when root->height is unreliable. | ||
35 | */ | ||
36 | #define RADIX_TREE_DIRECT_PTR 1 | ||
37 | |||
38 | static inline void *radix_tree_ptr_to_direct(void *ptr) | ||
39 | { | ||
40 | return (void *)((unsigned long)ptr | RADIX_TREE_DIRECT_PTR); | ||
41 | } | ||
42 | |||
43 | static inline void *radix_tree_direct_to_ptr(void *ptr) | ||
44 | { | ||
45 | return (void *)((unsigned long)ptr & ~RADIX_TREE_DIRECT_PTR); | ||
46 | } | ||
47 | |||
48 | static inline int radix_tree_is_direct_ptr(void *ptr) | ||
49 | { | ||
50 | return (int)((unsigned long)ptr & RADIX_TREE_DIRECT_PTR); | ||
51 | } | ||
52 | |||
53 | /*** radix-tree API starts here ***/ | ||
24 | 54 | ||
25 | #define RADIX_TREE_MAX_TAGS 2 | 55 | #define RADIX_TREE_MAX_TAGS 2 |
26 | 56 | ||
@@ -47,6 +77,77 @@ do { \ | |||
47 | (root)->rnode = NULL; \ | 77 | (root)->rnode = NULL; \ |
48 | } while (0) | 78 | } while (0) |
49 | 79 | ||
80 | /** | ||
81 | * Radix-tree synchronization | ||
82 | * | ||
83 | * The radix-tree API requires that users provide all synchronisation (with | ||
84 | * specific exceptions, noted below). | ||
85 | * | ||
86 | * Synchronization of access to the data items being stored in the tree, and | ||
87 | * management of their lifetimes must be completely managed by API users. | ||
88 | * | ||
89 | * For API usage, in general, | ||
90 | * - any function _modifying_ the tree or tags (inserting or deleting | ||
91 | * items, setting or clearing tags) must exclude other modifications, and | ||
92 | * exclude any functions reading the tree. | ||
93 | * - any function _reading_ the tree or tags (looking up items or tags, | ||
94 | * gang lookups) must exclude modifications to the tree, but may occur | ||
95 | * concurrently with other readers. | ||
96 | * | ||
97 | * The notable exceptions to this rule are the following functions: | ||
98 | * radix_tree_lookup | ||
99 | * radix_tree_tag_get | ||
100 | * radix_tree_gang_lookup | ||
101 | * radix_tree_gang_lookup_tag | ||
102 | * radix_tree_tagged | ||
103 | * | ||
104 | * The first 4 functions are able to be called locklessly, using RCU. The | ||
105 | * caller must ensure calls to these functions are made within rcu_read_lock() | ||
106 | * regions. Other readers (lock-free or otherwise) and modifications may be | ||
107 | * running concurrently. | ||
108 | * | ||
109 | * It is still required that the caller manage the synchronization and lifetimes | ||
110 | * of the items. So if RCU lock-free lookups are used, typically this would mean | ||
111 | * that the items have their own locks, or are amenable to lock-free access; and | ||
112 | * that the items are freed by RCU (or only freed after having been deleted from | ||
113 | * the radix tree *and* a synchronize_rcu() grace period). | ||
114 | * | ||
115 | * (Note, rcu_assign_pointer and rcu_dereference are not needed to control | ||
116 | * access to data items when inserting into or looking up from the radix tree) | ||
117 | * | ||
118 | * radix_tree_tagged is able to be called without locking or RCU. | ||
119 | */ | ||
120 | |||
121 | /** | ||
122 | * radix_tree_deref_slot - dereference a slot | ||
123 | * @pslot: pointer to slot, returned by radix_tree_lookup_slot | ||
124 | * Returns: item that was stored in that slot with any direct pointer flag | ||
125 | * removed. | ||
126 | * | ||
127 | * For use with radix_tree_lookup_slot(). Caller must hold tree at least read | ||
128 | * locked across slot lookup and dereference. More likely, will be used with | ||
129 | * radix_tree_replace_slot(), as well, so caller will hold tree write locked. | ||
130 | */ | ||
131 | static inline void *radix_tree_deref_slot(void **pslot) | ||
132 | { | ||
133 | return radix_tree_direct_to_ptr(*pslot); | ||
134 | } | ||
135 | /** | ||
136 | * radix_tree_replace_slot - replace item in a slot | ||
137 | * @pslot: pointer to slot, returned by radix_tree_lookup_slot | ||
138 | * @item: new item to store in the slot. | ||
139 | * | ||
140 | * For use with radix_tree_lookup_slot(). Caller must hold tree write locked | ||
141 | * across slot lookup and replacement. | ||
142 | */ | ||
143 | static inline void radix_tree_replace_slot(void **pslot, void *item) | ||
144 | { | ||
145 | BUG_ON(radix_tree_is_direct_ptr(item)); | ||
146 | rcu_assign_pointer(*pslot, | ||
147 | (void *)((unsigned long)item | | ||
148 | ((unsigned long)*pslot & RADIX_TREE_DIRECT_PTR))); | ||
149 | } | ||
150 | |||
50 | int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); | 151 | int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); |
51 | void *radix_tree_lookup(struct radix_tree_root *, unsigned long); | 152 | void *radix_tree_lookup(struct radix_tree_root *, unsigned long); |
52 | void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); | 153 | void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); |
diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9eb25955019f..e2cefabb5aa0 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c | |||
@@ -2,6 +2,7 @@ | |||
2 | * Copyright (C) 2001 Momchil Velikov | 2 | * Copyright (C) 2001 Momchil Velikov |
3 | * Portions Copyright (C) 2001 Christoph Hellwig | 3 | * Portions Copyright (C) 2001 Christoph Hellwig |
4 | * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com> | 4 | * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com> |
5 | * Copyright (C) 2006 Nick Piggin | ||
5 | * | 6 | * |
6 | * This program is free software; you can redistribute it and/or | 7 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public License as | 8 | * modify it under the terms of the GNU General Public License as |
@@ -30,6 +31,7 @@ | |||
30 | #include <linux/gfp.h> | 31 | #include <linux/gfp.h> |
31 | #include <linux/string.h> | 32 | #include <linux/string.h> |
32 | #include <linux/bitops.h> | 33 | #include <linux/bitops.h> |
34 | #include <linux/rcupdate.h> | ||
33 | 35 | ||
34 | 36 | ||
35 | #ifdef __KERNEL__ | 37 | #ifdef __KERNEL__ |
@@ -45,7 +47,9 @@ | |||
45 | ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) | 47 | ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) |
46 | 48 | ||
47 | struct radix_tree_node { | 49 | struct radix_tree_node { |
50 | unsigned int height; /* Height from the bottom */ | ||
48 | unsigned int count; | 51 | unsigned int count; |
52 | struct rcu_head rcu_head; | ||
49 | void *slots[RADIX_TREE_MAP_SIZE]; | 53 | void *slots[RADIX_TREE_MAP_SIZE]; |
50 | unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; | 54 | unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; |
51 | }; | 55 | }; |
@@ -100,13 +104,21 @@ radix_tree_node_alloc(struct radix_tree_root *root) | |||
100 | rtp->nr--; | 104 | rtp->nr--; |
101 | } | 105 | } |
102 | } | 106 | } |
107 | BUG_ON(radix_tree_is_direct_ptr(ret)); | ||
103 | return ret; | 108 | return ret; |
104 | } | 109 | } |
105 | 110 | ||
111 | static void radix_tree_node_rcu_free(struct rcu_head *head) | ||
112 | { | ||
113 | struct radix_tree_node *node = | ||
114 | container_of(head, struct radix_tree_node, rcu_head); | ||
115 | kmem_cache_free(radix_tree_node_cachep, node); | ||
116 | } | ||
117 | |||
106 | static inline void | 118 | static inline void |
107 | radix_tree_node_free(struct radix_tree_node *node) | 119 | radix_tree_node_free(struct radix_tree_node *node) |
108 | { | 120 | { |
109 | kmem_cache_free(radix_tree_node_cachep, node); | 121 | call_rcu(&node->rcu_head, radix_tree_node_rcu_free); |
110 | } | 122 | } |
111 | 123 | ||
112 | /* | 124 | /* |
@@ -222,11 +234,12 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) | |||
222 | } | 234 | } |
223 | 235 | ||
224 | do { | 236 | do { |
237 | unsigned int newheight; | ||
225 | if (!(node = radix_tree_node_alloc(root))) | 238 | if (!(node = radix_tree_node_alloc(root))) |
226 | return -ENOMEM; | 239 | return -ENOMEM; |
227 | 240 | ||
228 | /* Increase the height. */ | 241 | /* Increase the height. */ |
229 | node->slots[0] = root->rnode; | 242 | node->slots[0] = radix_tree_direct_to_ptr(root->rnode); |
230 | 243 | ||
231 | /* Propagate the aggregated tag info into the new root */ | 244 | /* Propagate the aggregated tag info into the new root */ |
232 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { | 245 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { |
@@ -234,9 +247,11 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) | |||
234 | tag_set(node, tag, 0); | 247 | tag_set(node, tag, 0); |
235 | } | 248 | } |
236 | 249 | ||
250 | newheight = root->height+1; | ||
251 | node->height = newheight; | ||
237 | node->count = 1; | 252 | node->count = 1; |
238 | root->rnode = node; | 253 | rcu_assign_pointer(root->rnode, node); |
239 | root->height++; | 254 | root->height = newheight; |
240 | } while (height > root->height); | 255 | } while (height > root->height); |
241 | out: | 256 | out: |
242 | return 0; | 257 | return 0; |
@@ -258,6 +273,8 @@ int radix_tree_insert(struct radix_tree_root *root, | |||
258 | int offset; | 273 | int offset; |
259 | int error; | 274 | int error; |
260 | 275 | ||
276 | BUG_ON(radix_tree_is_direct_ptr(item)); | ||
277 | |||
261 | /* Make sure the tree is high enough. */ | 278 | /* Make sure the tree is high enough. */ |
262 | if (index > radix_tree_maxindex(root->height)) { | 279 | if (index > radix_tree_maxindex(root->height)) { |
263 | error = radix_tree_extend(root, index); | 280 | error = radix_tree_extend(root, index); |
@@ -275,11 +292,12 @@ int radix_tree_insert(struct radix_tree_root *root, | |||
275 | /* Have to add a child node. */ | 292 | /* Have to add a child node. */ |
276 | if (!(slot = radix_tree_node_alloc(root))) | 293 | if (!(slot = radix_tree_node_alloc(root))) |
277 | return -ENOMEM; | 294 | return -ENOMEM; |
295 | slot->height = height; | ||
278 | if (node) { | 296 | if (node) { |
279 | node->slots[offset] = slot; | 297 | rcu_assign_pointer(node->slots[offset], slot); |
280 | node->count++; | 298 | node->count++; |
281 | } else | 299 | } else |
282 | root->rnode = slot; | 300 | rcu_assign_pointer(root->rnode, slot); |
283 | } | 301 | } |
284 | 302 | ||
285 | /* Go a level down */ | 303 | /* Go a level down */ |
@@ -295,11 +313,11 @@ int radix_tree_insert(struct radix_tree_root *root, | |||
295 | 313 | ||
296 | if (node) { | 314 | if (node) { |
297 | node->count++; | 315 | node->count++; |
298 | node->slots[offset] = item; | 316 | rcu_assign_pointer(node->slots[offset], item); |
299 | BUG_ON(tag_get(node, 0, offset)); | 317 | BUG_ON(tag_get(node, 0, offset)); |
300 | BUG_ON(tag_get(node, 1, offset)); | 318 | BUG_ON(tag_get(node, 1, offset)); |
301 | } else { | 319 | } else { |
302 | root->rnode = item; | 320 | rcu_assign_pointer(root->rnode, radix_tree_ptr_to_direct(item)); |
303 | BUG_ON(root_tag_get(root, 0)); | 321 | BUG_ON(root_tag_get(root, 0)); |
304 | BUG_ON(root_tag_get(root, 1)); | 322 | BUG_ON(root_tag_get(root, 1)); |
305 | } | 323 | } |
@@ -308,49 +326,54 @@ int radix_tree_insert(struct radix_tree_root *root, | |||
308 | } | 326 | } |
309 | EXPORT_SYMBOL(radix_tree_insert); | 327 | EXPORT_SYMBOL(radix_tree_insert); |
310 | 328 | ||
311 | static inline void **__lookup_slot(struct radix_tree_root *root, | 329 | /** |
312 | unsigned long index) | 330 | * radix_tree_lookup_slot - lookup a slot in a radix tree |
331 | * @root: radix tree root | ||
332 | * @index: index key | ||
333 | * | ||
334 | * Returns: the slot corresponding to the position @index in the | ||
335 | * radix tree @root. This is useful for update-if-exists operations. | ||
336 | * | ||
337 | * This function cannot be called under rcu_read_lock; it must be | ||
338 | * excluded from writers, as must the returned slot for subsequent | ||
339 | * use by radix_tree_deref_slot() and radix_tree_replace_slot(). | ||
340 | * Caller must hold tree write locked across slot lookup and | ||
341 | * replace. | ||
342 | */ | ||
343 | void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) | ||
313 | { | 344 | { |
314 | unsigned int height, shift; | 345 | unsigned int height, shift; |
315 | struct radix_tree_node **slot; | 346 | struct radix_tree_node *node, **slot; |
316 | |||
317 | height = root->height; | ||
318 | 347 | ||
319 | if (index > radix_tree_maxindex(height)) | 348 | node = root->rnode; |
349 | if (node == NULL) | ||
320 | return NULL; | 350 | return NULL; |
321 | 351 | ||
322 | if (height == 0 && root->rnode) | 352 | if (radix_tree_is_direct_ptr(node)) { |
353 | if (index > 0) | ||
354 | return NULL; | ||
323 | return (void **)&root->rnode; | 355 | return (void **)&root->rnode; |
356 | } | ||
357 | |||
358 | height = node->height; | ||
359 | if (index > radix_tree_maxindex(height)) | ||
360 | return NULL; | ||
324 | 361 | ||
325 | shift = (height-1) * RADIX_TREE_MAP_SHIFT; | 362 | shift = (height-1) * RADIX_TREE_MAP_SHIFT; |
326 | slot = &root->rnode; | ||
327 | 363 | ||
328 | while (height > 0) { | 364 | do { |
329 | if (*slot == NULL) | 365 | slot = (struct radix_tree_node **) |
366 | (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); | ||
367 | node = *slot; | ||
368 | if (node == NULL) | ||
330 | return NULL; | 369 | return NULL; |
331 | 370 | ||
332 | slot = (struct radix_tree_node **) | ||
333 | ((*slot)->slots + | ||
334 | ((index >> shift) & RADIX_TREE_MAP_MASK)); | ||
335 | shift -= RADIX_TREE_MAP_SHIFT; | 371 | shift -= RADIX_TREE_MAP_SHIFT; |
336 | height--; | 372 | height--; |
337 | } | 373 | } while (height > 0); |
338 | 374 | ||
339 | return (void **)slot; | 375 | return (void **)slot; |
340 | } | 376 | } |
341 | |||
342 | /** | ||
343 | * radix_tree_lookup_slot - lookup a slot in a radix tree | ||
344 | * @root: radix tree root | ||
345 | * @index: index key | ||
346 | * | ||
347 | * Lookup the slot corresponding to the position @index in the radix tree | ||
348 | * @root. This is useful for update-if-exists operations. | ||
349 | */ | ||
350 | void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) | ||
351 | { | ||
352 | return __lookup_slot(root, index); | ||
353 | } | ||
354 | EXPORT_SYMBOL(radix_tree_lookup_slot); | 377 | EXPORT_SYMBOL(radix_tree_lookup_slot); |
355 | 378 | ||
356 | /** | 379 | /** |
@@ -359,13 +382,45 @@ EXPORT_SYMBOL(radix_tree_lookup_slot); | |||
359 | * @index: index key | 382 | * @index: index key |
360 | * | 383 | * |
361 | * Lookup the item at the position @index in the radix tree @root. | 384 | * Lookup the item at the position @index in the radix tree @root. |
385 | * | ||
386 | * This function can be called under rcu_read_lock, however the caller | ||
387 | * must manage lifetimes of leaf nodes (eg. RCU may also be used to free | ||
388 | * them safely). No RCU barriers are required to access or modify the | ||
389 | * returned item, however. | ||
362 | */ | 390 | */ |
363 | void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) | 391 | void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) |
364 | { | 392 | { |
365 | void **slot; | 393 | unsigned int height, shift; |
394 | struct radix_tree_node *node, **slot; | ||
395 | |||
396 | node = rcu_dereference(root->rnode); | ||
397 | if (node == NULL) | ||
398 | return NULL; | ||
399 | |||
400 | if (radix_tree_is_direct_ptr(node)) { | ||
401 | if (index > 0) | ||
402 | return NULL; | ||
403 | return radix_tree_direct_to_ptr(node); | ||
404 | } | ||
405 | |||
406 | height = node->height; | ||
407 | if (index > radix_tree_maxindex(height)) | ||
408 | return NULL; | ||
409 | |||
410 | shift = (height-1) * RADIX_TREE_MAP_SHIFT; | ||
366 | 411 | ||
367 | slot = __lookup_slot(root, index); | 412 | do { |
368 | return slot != NULL ? *slot : NULL; | 413 | slot = (struct radix_tree_node **) |
414 | (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); | ||
415 | node = rcu_dereference(*slot); | ||
416 | if (node == NULL) | ||
417 | return NULL; | ||
418 | |||
419 | shift -= RADIX_TREE_MAP_SHIFT; | ||
420 | height--; | ||
421 | } while (height > 0); | ||
422 | |||
423 | return node; | ||
369 | } | 424 | } |
370 | EXPORT_SYMBOL(radix_tree_lookup); | 425 | EXPORT_SYMBOL(radix_tree_lookup); |
371 | 426 | ||
@@ -495,27 +550,30 @@ int radix_tree_tag_get(struct radix_tree_root *root, | |||
495 | unsigned long index, unsigned int tag) | 550 | unsigned long index, unsigned int tag) |
496 | { | 551 | { |
497 | unsigned int height, shift; | 552 | unsigned int height, shift; |
498 | struct radix_tree_node *slot; | 553 | struct radix_tree_node *node; |
499 | int saw_unset_tag = 0; | 554 | int saw_unset_tag = 0; |
500 | 555 | ||
501 | height = root->height; | ||
502 | if (index > radix_tree_maxindex(height)) | ||
503 | return 0; | ||
504 | |||
505 | /* check the root's tag bit */ | 556 | /* check the root's tag bit */ |
506 | if (!root_tag_get(root, tag)) | 557 | if (!root_tag_get(root, tag)) |
507 | return 0; | 558 | return 0; |
508 | 559 | ||
509 | if (height == 0) | 560 | node = rcu_dereference(root->rnode); |
510 | return 1; | 561 | if (node == NULL) |
562 | return 0; | ||
563 | |||
564 | if (radix_tree_is_direct_ptr(node)) | ||
565 | return (index == 0); | ||
566 | |||
567 | height = node->height; | ||
568 | if (index > radix_tree_maxindex(height)) | ||
569 | return 0; | ||
511 | 570 | ||
512 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; | 571 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; |
513 | slot = root->rnode; | ||
514 | 572 | ||
515 | for ( ; ; ) { | 573 | for ( ; ; ) { |
516 | int offset; | 574 | int offset; |
517 | 575 | ||
518 | if (slot == NULL) | 576 | if (node == NULL) |
519 | return 0; | 577 | return 0; |
520 | 578 | ||
521 | offset = (index >> shift) & RADIX_TREE_MAP_MASK; | 579 | offset = (index >> shift) & RADIX_TREE_MAP_MASK; |
@@ -524,15 +582,15 @@ int radix_tree_tag_get(struct radix_tree_root *root, | |||
524 | * This is just a debug check. Later, we can bale as soon as | 582 | * This is just a debug check. Later, we can bale as soon as |
525 | * we see an unset tag. | 583 | * we see an unset tag. |
526 | */ | 584 | */ |
527 | if (!tag_get(slot, tag, offset)) | 585 | if (!tag_get(node, tag, offset)) |
528 | saw_unset_tag = 1; | 586 | saw_unset_tag = 1; |
529 | if (height == 1) { | 587 | if (height == 1) { |
530 | int ret = tag_get(slot, tag, offset); | 588 | int ret = tag_get(node, tag, offset); |
531 | 589 | ||
532 | BUG_ON(ret && saw_unset_tag); | 590 | BUG_ON(ret && saw_unset_tag); |
533 | return !!ret; | 591 | return !!ret; |
534 | } | 592 | } |
535 | slot = slot->slots[offset]; | 593 | node = rcu_dereference(node->slots[offset]); |
536 | shift -= RADIX_TREE_MAP_SHIFT; | 594 | shift -= RADIX_TREE_MAP_SHIFT; |
537 | height--; | 595 | height--; |
538 | } | 596 | } |
@@ -541,47 +599,45 @@ EXPORT_SYMBOL(radix_tree_tag_get); | |||
541 | #endif | 599 | #endif |
542 | 600 | ||
543 | static unsigned int | 601 | static unsigned int |
544 | __lookup(struct radix_tree_root *root, void **results, unsigned long index, | 602 | __lookup(struct radix_tree_node *slot, void **results, unsigned long index, |
545 | unsigned int max_items, unsigned long *next_index) | 603 | unsigned int max_items, unsigned long *next_index) |
546 | { | 604 | { |
547 | unsigned int nr_found = 0; | 605 | unsigned int nr_found = 0; |
548 | unsigned int shift, height; | 606 | unsigned int shift, height; |
549 | struct radix_tree_node *slot; | ||
550 | unsigned long i; | 607 | unsigned long i; |
551 | 608 | ||
552 | height = root->height; | 609 | height = slot->height; |
553 | if (height == 0) { | 610 | if (height == 0) |
554 | if (root->rnode && index == 0) | ||
555 | results[nr_found++] = root->rnode; | ||
556 | goto out; | 611 | goto out; |
557 | } | ||
558 | |||
559 | shift = (height-1) * RADIX_TREE_MAP_SHIFT; | 612 | shift = (height-1) * RADIX_TREE_MAP_SHIFT; |
560 | slot = root->rnode; | ||
561 | 613 | ||
562 | for ( ; height > 1; height--) { | 614 | for ( ; height > 1; height--) { |
563 | 615 | i = (index >> shift) & RADIX_TREE_MAP_MASK; | |
564 | for (i = (index >> shift) & RADIX_TREE_MAP_MASK ; | 616 | for (;;) { |
565 | i < RADIX_TREE_MAP_SIZE; i++) { | ||
566 | if (slot->slots[i] != NULL) | 617 | if (slot->slots[i] != NULL) |
567 | break; | 618 | break; |
568 | index &= ~((1UL << shift) - 1); | 619 | index &= ~((1UL << shift) - 1); |
569 | index += 1UL << shift; | 620 | index += 1UL << shift; |
570 | if (index == 0) | 621 | if (index == 0) |
571 | goto out; /* 32-bit wraparound */ | 622 | goto out; /* 32-bit wraparound */ |
623 | i++; | ||
624 | if (i == RADIX_TREE_MAP_SIZE) | ||
625 | goto out; | ||
572 | } | 626 | } |
573 | if (i == RADIX_TREE_MAP_SIZE) | ||
574 | goto out; | ||
575 | 627 | ||
576 | shift -= RADIX_TREE_MAP_SHIFT; | 628 | shift -= RADIX_TREE_MAP_SHIFT; |
577 | slot = slot->slots[i]; | 629 | slot = rcu_dereference(slot->slots[i]); |
630 | if (slot == NULL) | ||
631 | goto out; | ||
578 | } | 632 | } |
579 | 633 | ||
580 | /* Bottom level: grab some items */ | 634 | /* Bottom level: grab some items */ |
581 | for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { | 635 | for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { |
636 | struct radix_tree_node *node; | ||
582 | index++; | 637 | index++; |
583 | if (slot->slots[i]) { | 638 | node = slot->slots[i]; |
584 | results[nr_found++] = slot->slots[i]; | 639 | if (node) { |
640 | results[nr_found++] = rcu_dereference(node); | ||
585 | if (nr_found == max_items) | 641 | if (nr_found == max_items) |
586 | goto out; | 642 | goto out; |
587 | } | 643 | } |
@@ -603,28 +659,51 @@ out: | |||
603 | * *@results. | 659 | * *@results. |
604 | * | 660 | * |
605 | * The implementation is naive. | 661 | * The implementation is naive. |
662 | * | ||
663 | * Like radix_tree_lookup, radix_tree_gang_lookup may be called under | ||
664 | * rcu_read_lock. In this case, rather than the returned results being | ||
665 | * an atomic snapshot of the tree at a single point in time, the semantics | ||
666 | * of an RCU protected gang lookup are as though multiple radix_tree_lookups | ||
667 | * have been issued in individual locks, and results stored in 'results'. | ||
606 | */ | 668 | */ |
607 | unsigned int | 669 | unsigned int |
608 | radix_tree_gang_lookup(struct radix_tree_root *root, void **results, | 670 | radix_tree_gang_lookup(struct radix_tree_root *root, void **results, |
609 | unsigned long first_index, unsigned int max_items) | 671 | unsigned long first_index, unsigned int max_items) |
610 | { | 672 | { |
611 | const unsigned long max_index = radix_tree_maxindex(root->height); | 673 | unsigned long max_index; |
674 | struct radix_tree_node *node; | ||
612 | unsigned long cur_index = first_index; | 675 | unsigned long cur_index = first_index; |
613 | unsigned int ret = 0; | 676 | unsigned int ret; |
677 | |||
678 | node = rcu_dereference(root->rnode); | ||
679 | if (!node) | ||
680 | return 0; | ||
614 | 681 | ||
682 | if (radix_tree_is_direct_ptr(node)) { | ||
683 | if (first_index > 0) | ||
684 | return 0; | ||
685 | node = radix_tree_direct_to_ptr(node); | ||
686 | results[0] = rcu_dereference(node); | ||
687 | return 1; | ||
688 | } | ||
689 | |||
690 | max_index = radix_tree_maxindex(node->height); | ||
691 | |||
692 | ret = 0; | ||
615 | while (ret < max_items) { | 693 | while (ret < max_items) { |
616 | unsigned int nr_found; | 694 | unsigned int nr_found; |
617 | unsigned long next_index; /* Index of next search */ | 695 | unsigned long next_index; /* Index of next search */ |
618 | 696 | ||
619 | if (cur_index > max_index) | 697 | if (cur_index > max_index) |
620 | break; | 698 | break; |
621 | nr_found = __lookup(root, results + ret, cur_index, | 699 | nr_found = __lookup(node, results + ret, cur_index, |
622 | max_items - ret, &next_index); | 700 | max_items - ret, &next_index); |
623 | ret += nr_found; | 701 | ret += nr_found; |
624 | if (next_index == 0) | 702 | if (next_index == 0) |
625 | break; | 703 | break; |
626 | cur_index = next_index; | 704 | cur_index = next_index; |
627 | } | 705 | } |
706 | |||
628 | return ret; | 707 | return ret; |
629 | } | 708 | } |
630 | EXPORT_SYMBOL(radix_tree_gang_lookup); | 709 | EXPORT_SYMBOL(radix_tree_gang_lookup); |
@@ -634,55 +713,64 @@ EXPORT_SYMBOL(radix_tree_gang_lookup); | |||
634 | * open-coding the search. | 713 | * open-coding the search. |
635 | */ | 714 | */ |
636 | static unsigned int | 715 | static unsigned int |
637 | __lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, | 716 | __lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index, |
638 | unsigned int max_items, unsigned long *next_index, unsigned int tag) | 717 | unsigned int max_items, unsigned long *next_index, unsigned int tag) |
639 | { | 718 | { |
640 | unsigned int nr_found = 0; | 719 | unsigned int nr_found = 0; |
641 | unsigned int shift; | 720 | unsigned int shift, height; |
642 | unsigned int height = root->height; | ||
643 | struct radix_tree_node *slot; | ||
644 | 721 | ||
645 | if (height == 0) { | 722 | height = slot->height; |
646 | if (root->rnode && index == 0) | 723 | if (height == 0) |
647 | results[nr_found++] = root->rnode; | ||
648 | goto out; | 724 | goto out; |
649 | } | 725 | shift = (height-1) * RADIX_TREE_MAP_SHIFT; |
650 | |||
651 | shift = (height - 1) * RADIX_TREE_MAP_SHIFT; | ||
652 | slot = root->rnode; | ||
653 | 726 | ||
654 | do { | 727 | while (height > 0) { |
655 | unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; | 728 | unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ; |
656 | 729 | ||
657 | for ( ; i < RADIX_TREE_MAP_SIZE; i++) { | 730 | for (;;) { |
658 | if (tag_get(slot, tag, i)) { | 731 | if (tag_get(slot, tag, i)) |
659 | BUG_ON(slot->slots[i] == NULL); | ||
660 | break; | 732 | break; |
661 | } | ||
662 | index &= ~((1UL << shift) - 1); | 733 | index &= ~((1UL << shift) - 1); |
663 | index += 1UL << shift; | 734 | index += 1UL << shift; |
664 | if (index == 0) | 735 | if (index == 0) |
665 | goto out; /* 32-bit wraparound */ | 736 | goto out; /* 32-bit wraparound */ |
737 | i++; | ||
738 | if (i == RADIX_TREE_MAP_SIZE) | ||
739 | goto out; | ||
666 | } | 740 | } |
667 | if (i == RADIX_TREE_MAP_SIZE) | ||
668 | goto out; | ||
669 | height--; | 741 | height--; |
670 | if (height == 0) { /* Bottom level: grab some items */ | 742 | if (height == 0) { /* Bottom level: grab some items */ |
671 | unsigned long j = index & RADIX_TREE_MAP_MASK; | 743 | unsigned long j = index & RADIX_TREE_MAP_MASK; |
672 | 744 | ||
673 | for ( ; j < RADIX_TREE_MAP_SIZE; j++) { | 745 | for ( ; j < RADIX_TREE_MAP_SIZE; j++) { |
746 | struct radix_tree_node *node; | ||
674 | index++; | 747 | index++; |
675 | if (tag_get(slot, tag, j)) { | 748 | if (!tag_get(slot, tag, j)) |
676 | BUG_ON(slot->slots[j] == NULL); | 749 | continue; |
677 | results[nr_found++] = slot->slots[j]; | 750 | node = slot->slots[j]; |
751 | /* | ||
752 | * Even though the tag was found set, we need to | ||
753 | * recheck that we have a non-NULL node, because | ||
754 | * if this lookup is lockless, it may have been | ||
755 | * subsequently deleted. | ||
756 | * | ||
757 | * Similar care must be taken in any place that | ||
758 | * looks up ->slots[x] without a lock (i.e. can't | ||
759 | * rely on its value remaining the same). | ||
760 | */ | ||
761 | if (node) { | ||
762 | node = rcu_dereference(node); | ||
763 | results[nr_found++] = node; | ||
678 | if (nr_found == max_items) | 764 | if (nr_found == max_items) |
679 | goto out; | 765 | goto out; |
680 | } | 766 | } |
681 | } | 767 | } |
682 | } | 768 | } |
683 | shift -= RADIX_TREE_MAP_SHIFT; | 769 | shift -= RADIX_TREE_MAP_SHIFT; |
684 | slot = slot->slots[i]; | 770 | slot = rcu_dereference(slot->slots[i]); |
685 | } while (height > 0); | 771 | if (slot == NULL) |
772 | break; | ||
773 | } | ||
686 | out: | 774 | out: |
687 | *next_index = index; | 775 | *next_index = index; |
688 | return nr_found; | 776 | return nr_found; |
@@ -706,27 +794,44 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, | |||
706 | unsigned long first_index, unsigned int max_items, | 794 | unsigned long first_index, unsigned int max_items, |
707 | unsigned int tag) | 795 | unsigned int tag) |
708 | { | 796 | { |
709 | const unsigned long max_index = radix_tree_maxindex(root->height); | 797 | struct radix_tree_node *node; |
798 | unsigned long max_index; | ||
710 | unsigned long cur_index = first_index; | 799 | unsigned long cur_index = first_index; |
711 | unsigned int ret = 0; | 800 | unsigned int ret; |
712 | 801 | ||
713 | /* check the root's tag bit */ | 802 | /* check the root's tag bit */ |
714 | if (!root_tag_get(root, tag)) | 803 | if (!root_tag_get(root, tag)) |
715 | return 0; | 804 | return 0; |
716 | 805 | ||
806 | node = rcu_dereference(root->rnode); | ||
807 | if (!node) | ||
808 | return 0; | ||
809 | |||
810 | if (radix_tree_is_direct_ptr(node)) { | ||
811 | if (first_index > 0) | ||
812 | return 0; | ||
813 | node = radix_tree_direct_to_ptr(node); | ||
814 | results[0] = rcu_dereference(node); | ||
815 | return 1; | ||
816 | } | ||
817 | |||
818 | max_index = radix_tree_maxindex(node->height); | ||
819 | |||
820 | ret = 0; | ||
717 | while (ret < max_items) { | 821 | while (ret < max_items) { |
718 | unsigned int nr_found; | 822 | unsigned int nr_found; |
719 | unsigned long next_index; /* Index of next search */ | 823 | unsigned long next_index; /* Index of next search */ |
720 | 824 | ||
721 | if (cur_index > max_index) | 825 | if (cur_index > max_index) |
722 | break; | 826 | break; |
723 | nr_found = __lookup_tag(root, results + ret, cur_index, | 827 | nr_found = __lookup_tag(node, results + ret, cur_index, |
724 | max_items - ret, &next_index, tag); | 828 | max_items - ret, &next_index, tag); |
725 | ret += nr_found; | 829 | ret += nr_found; |
726 | if (next_index == 0) | 830 | if (next_index == 0) |
727 | break; | 831 | break; |
728 | cur_index = next_index; | 832 | cur_index = next_index; |
729 | } | 833 | } |
834 | |||
730 | return ret; | 835 | return ret; |
731 | } | 836 | } |
732 | EXPORT_SYMBOL(radix_tree_gang_lookup_tag); | 837 | EXPORT_SYMBOL(radix_tree_gang_lookup_tag); |
@@ -742,8 +847,19 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) | |||
742 | root->rnode->count == 1 && | 847 | root->rnode->count == 1 && |
743 | root->rnode->slots[0]) { | 848 | root->rnode->slots[0]) { |
744 | struct radix_tree_node *to_free = root->rnode; | 849 | struct radix_tree_node *to_free = root->rnode; |
850 | void *newptr; | ||
745 | 851 | ||
746 | root->rnode = to_free->slots[0]; | 852 | /* |
853 | * We don't need rcu_assign_pointer(), since we are simply | ||
854 | * moving the node from one part of the tree to another. If | ||
855 | * it was safe to dereference the old pointer to it | ||
856 | * (to_free->slots[0]), it will be safe to dereference the new | ||
857 | * one (root->rnode). | ||
858 | */ | ||
859 | newptr = to_free->slots[0]; | ||
860 | if (root->height == 1) | ||
861 | newptr = radix_tree_ptr_to_direct(newptr); | ||
862 | root->rnode = newptr; | ||
747 | root->height--; | 863 | root->height--; |
748 | /* must only free zeroed nodes into the slab */ | 864 | /* must only free zeroed nodes into the slab */ |
749 | tag_clear(to_free, 0, 0); | 865 | tag_clear(to_free, 0, 0); |
@@ -767,6 +883,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | |||
767 | { | 883 | { |
768 | struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; | 884 | struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; |
769 | struct radix_tree_node *slot = NULL; | 885 | struct radix_tree_node *slot = NULL; |
886 | struct radix_tree_node *to_free; | ||
770 | unsigned int height, shift; | 887 | unsigned int height, shift; |
771 | int tag; | 888 | int tag; |
772 | int offset; | 889 | int offset; |
@@ -777,6 +894,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | |||
777 | 894 | ||
778 | slot = root->rnode; | 895 | slot = root->rnode; |
779 | if (height == 0 && root->rnode) { | 896 | if (height == 0 && root->rnode) { |
897 | slot = radix_tree_direct_to_ptr(slot); | ||
780 | root_tag_clear_all(root); | 898 | root_tag_clear_all(root); |
781 | root->rnode = NULL; | 899 | root->rnode = NULL; |
782 | goto out; | 900 | goto out; |
@@ -809,10 +927,17 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | |||
809 | radix_tree_tag_clear(root, index, tag); | 927 | radix_tree_tag_clear(root, index, tag); |
810 | } | 928 | } |
811 | 929 | ||
930 | to_free = NULL; | ||
812 | /* Now free the nodes we do not need anymore */ | 931 | /* Now free the nodes we do not need anymore */ |
813 | while (pathp->node) { | 932 | while (pathp->node) { |
814 | pathp->node->slots[pathp->offset] = NULL; | 933 | pathp->node->slots[pathp->offset] = NULL; |
815 | pathp->node->count--; | 934 | pathp->node->count--; |
935 | /* | ||
936 | * Queue the node for deferred freeing after the | ||
937 | * last reference to it disappears (set NULL, above). | ||
938 | */ | ||
939 | if (to_free) | ||
940 | radix_tree_node_free(to_free); | ||
816 | 941 | ||
817 | if (pathp->node->count) { | 942 | if (pathp->node->count) { |
818 | if (pathp->node == root->rnode) | 943 | if (pathp->node == root->rnode) |
@@ -821,13 +946,15 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | |||
821 | } | 946 | } |
822 | 947 | ||
823 | /* Node with zero slots in use so free it */ | 948 | /* Node with zero slots in use so free it */ |
824 | radix_tree_node_free(pathp->node); | 949 | to_free = pathp->node; |
825 | |||
826 | pathp--; | 950 | pathp--; |
951 | |||
827 | } | 952 | } |
828 | root_tag_clear_all(root); | 953 | root_tag_clear_all(root); |
829 | root->height = 0; | 954 | root->height = 0; |
830 | root->rnode = NULL; | 955 | root->rnode = NULL; |
956 | if (to_free) | ||
957 | radix_tree_node_free(to_free); | ||
831 | 958 | ||
832 | out: | 959 | out: |
833 | return slot; | 960 | return slot; |
diff --git a/mm/migrate.c b/mm/migrate.c index b4979d423d2b..e9b161bde95b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -294,7 +294,7 @@ out: | |||
294 | static int migrate_page_move_mapping(struct address_space *mapping, | 294 | static int migrate_page_move_mapping(struct address_space *mapping, |
295 | struct page *newpage, struct page *page) | 295 | struct page *newpage, struct page *page) |
296 | { | 296 | { |
297 | struct page **radix_pointer; | 297 | void **pslot; |
298 | 298 | ||
299 | if (!mapping) { | 299 | if (!mapping) { |
300 | /* Anonymous page */ | 300 | /* Anonymous page */ |
@@ -305,12 +305,11 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
305 | 305 | ||
306 | write_lock_irq(&mapping->tree_lock); | 306 | write_lock_irq(&mapping->tree_lock); |
307 | 307 | ||
308 | radix_pointer = (struct page **)radix_tree_lookup_slot( | 308 | pslot = radix_tree_lookup_slot(&mapping->page_tree, |
309 | &mapping->page_tree, | 309 | page_index(page)); |
310 | page_index(page)); | ||
311 | 310 | ||
312 | if (page_count(page) != 2 + !!PagePrivate(page) || | 311 | if (page_count(page) != 2 + !!PagePrivate(page) || |
313 | *radix_pointer != page) { | 312 | (struct page *)radix_tree_deref_slot(pslot) != page) { |
314 | write_unlock_irq(&mapping->tree_lock); | 313 | write_unlock_irq(&mapping->tree_lock); |
315 | return -EAGAIN; | 314 | return -EAGAIN; |
316 | } | 315 | } |
@@ -318,7 +317,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
318 | /* | 317 | /* |
319 | * Now we know that no one else is looking at the page. | 318 | * Now we know that no one else is looking at the page. |
320 | */ | 319 | */ |
321 | get_page(newpage); | 320 | get_page(newpage); /* add cache reference */ |
322 | #ifdef CONFIG_SWAP | 321 | #ifdef CONFIG_SWAP |
323 | if (PageSwapCache(page)) { | 322 | if (PageSwapCache(page)) { |
324 | SetPageSwapCache(newpage); | 323 | SetPageSwapCache(newpage); |
@@ -326,8 +325,14 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
326 | } | 325 | } |
327 | #endif | 326 | #endif |
328 | 327 | ||
329 | *radix_pointer = newpage; | 328 | radix_tree_replace_slot(pslot, newpage); |
329 | |||
330 | /* | ||
331 | * Drop cache reference from old page. | ||
332 | * We know this isn't the last reference. | ||
333 | */ | ||
330 | __put_page(page); | 334 | __put_page(page); |
335 | |||
331 | write_unlock_irq(&mapping->tree_lock); | 336 | write_unlock_irq(&mapping->tree_lock); |
332 | 337 | ||
333 | return 0; | 338 | return 0; |