aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Piggin <nickpiggin@yahoo.com.au>2008-06-12 18:21:52 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-06-12 21:05:41 -0400
commit643b52b9c0b4e959436b4b551ebf4060d06d5ae8 (patch)
tree5ccce7688ba638e863a391ca84441d081e666f99
parentd2187ebd84c7dd13ef269e9600f4daebeb02816e (diff)
radix-tree: fix small lockless radix-tree bug
We shrink a radix tree when its root node has only one child, in the left most slot. The child becomes the new root node. To perform this operation in a manner compatible with concurrent lockless lookups, we atomically switch the root pointer from the parent to its child. However a concurrent lockless lookup may now have loaded a pointer to the parent (and is presently deciding what to do next). For this reason, we also have to keep the parent node in a valid state after shrinking the tree, until the next RCU grace period -- otherwise this lookup with the parent pointer may not do the right thing. Notably, we need to keep the child in the left most slot there in case that is requested by the lookup. This is all pretty standard RCU stuff. It is worth repeating because in my eagerness to obey the radix tree node constructor scheme, I had broken it by zeroing the radix tree node before the grace period. What could happen is that a lookup can load the parent pointer, then decide it wants to follow the left most child slot, only to find the slot contained NULL due to the concurrent shrinker having zeroed the parent node before waiting for a grace period. The lookup would return a false negative as a result. Fix it by doing that clearing in the RCU callback. I would normally want to rip out the constructor entirely, but radix tree nodes are one of those places where they make sense (only few cachelines will be touched soon after allocation). This was never actually found in any lockless pagecache testing or by the test harness, but by seeing the odd problem with my scalable vmap rewrite. I have not tickled the test harness into reproducing it yet, but I'll keep working at it. Fortunately, it is not a problem anywhere lockless pagecache is used in mainline kernels (pagecache probe is not a guarantee, and brd does not have concurrent lookups and deletes). Signed-off-by: Nick Piggin <npiggin@suse.de> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: "Paul E. McKenney" <paulmck@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--lib/radix-tree.c120
1 files changed, 62 insertions, 58 deletions
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index bd521716ab1a..169a2f8dabcc 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -88,6 +88,57 @@ static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
88 return root->gfp_mask & __GFP_BITS_MASK; 88 return root->gfp_mask & __GFP_BITS_MASK;
89} 89}
90 90
91static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
92 int offset)
93{
94 __set_bit(offset, node->tags[tag]);
95}
96
97static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
98 int offset)
99{
100 __clear_bit(offset, node->tags[tag]);
101}
102
103static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
104 int offset)
105{
106 return test_bit(offset, node->tags[tag]);
107}
108
109static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
110{
111 root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
112}
113
114static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
115{
116 root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
117}
118
119static inline void root_tag_clear_all(struct radix_tree_root *root)
120{
121 root->gfp_mask &= __GFP_BITS_MASK;
122}
123
124static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
125{
126 return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
127}
128
129/*
130 * Returns 1 if any slot in the node has this tag set.
131 * Otherwise returns 0.
132 */
133static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
134{
135 int idx;
136 for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
137 if (node->tags[tag][idx])
138 return 1;
139 }
140 return 0;
141}
91/* 142/*
92 * This assumes that the caller has performed appropriate preallocation, and 143 * This assumes that the caller has performed appropriate preallocation, and
93 * that the caller has pinned this thread of control to the current CPU. 144 * that the caller has pinned this thread of control to the current CPU.
@@ -124,6 +175,17 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
124{ 175{
125 struct radix_tree_node *node = 176 struct radix_tree_node *node =
126 container_of(head, struct radix_tree_node, rcu_head); 177 container_of(head, struct radix_tree_node, rcu_head);
178
179 /*
180 * must only free zeroed nodes into the slab. radix_tree_shrink
181 * can leave us with a non-NULL entry in the first slot, so clear
182 * that here to make sure.
183 */
184 tag_clear(node, 0, 0);
185 tag_clear(node, 1, 0);
186 node->slots[0] = NULL;
187 node->count = 0;
188
127 kmem_cache_free(radix_tree_node_cachep, node); 189 kmem_cache_free(radix_tree_node_cachep, node);
128} 190}
129 191
@@ -165,59 +227,6 @@ out:
165} 227}
166EXPORT_SYMBOL(radix_tree_preload); 228EXPORT_SYMBOL(radix_tree_preload);
167 229
168static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
169 int offset)
170{
171 __set_bit(offset, node->tags[tag]);
172}
173
174static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
175 int offset)
176{
177 __clear_bit(offset, node->tags[tag]);
178}
179
180static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
181 int offset)
182{
183 return test_bit(offset, node->tags[tag]);
184}
185
186static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
187{
188 root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
189}
190
191
192static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
193{
194 root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
195}
196
197static inline void root_tag_clear_all(struct radix_tree_root *root)
198{
199 root->gfp_mask &= __GFP_BITS_MASK;
200}
201
202static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
203{
204 return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
205}
206
207/*
208 * Returns 1 if any slot in the node has this tag set.
209 * Otherwise returns 0.
210 */
211static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
212{
213 int idx;
214 for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
215 if (node->tags[tag][idx])
216 return 1;
217 }
218 return 0;
219}
220
221/* 230/*
222 * Return the maximum key which can be store into a 231 * Return the maximum key which can be store into a
223 * radix tree with height HEIGHT. 232 * radix tree with height HEIGHT.
@@ -930,11 +939,6 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
930 newptr = radix_tree_ptr_to_indirect(newptr); 939 newptr = radix_tree_ptr_to_indirect(newptr);
931 root->rnode = newptr; 940 root->rnode = newptr;
932 root->height--; 941 root->height--;
933 /* must only free zeroed nodes into the slab */
934 tag_clear(to_free, 0, 0);
935 tag_clear(to_free, 1, 0);
936 to_free->slots[0] = NULL;
937 to_free->count = 0;
938 radix_tree_node_free(to_free); 942 radix_tree_node_free(to_free);
939 } 943 }
940} 944}