author | Eric Dumazet <eric.dumazet@gmail.com> | 2010-06-15 04:23:14 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-06-15 17:23:38 -0400 |
commit | aa1039e73cc2cf834e99c09d2033d5d2675357b9 (patch) | |
tree | 0db06e4adddaf0f77b4e8de170710b74a17375e4 /net/ipv4 | |
parent | 7b34a4644b4342896e0c1967b8f953213ea4a990 (diff) |
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU:
1) Add a struct rcu_head to struct inet_peer.
2) Add a lookup_rcu_bh() helper to perform a lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit on the number of links followed by lookup_rcu_bh(). This
is needed in case we fall into a loop.
4) Add an smp_wmb() in link_to_pool() right before the node insert.
5) Make unlink_from_pool() use atomic_cmpxchg() to make sure it takes the
last reference to an inet_peer, since lockless readers could increase the
refcount even while we hold peers.lock.
6) Delay the freeing of struct inet_peer until after an RCU grace period,
so that lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts a lockless lookup. Note this lookup can
fail even if the target is in the AVL tree, because a concurrent writer
can leave the tree in a temporarily inconsistent shape. If this attempt
fails, the lock is taken and a regular lookup is performed again (both
the lookup and the deletion patterns are sketched after this list).
8) Convert peers.lock from an rwlock to a spinlock.
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because the
rcu_head adds 16 bytes on 64-bit arches, doubling the effective object
size (64 -> 128 bytes).
A future patch could probably revert this part by putting the rcu field
in a union sharing space with rid, ip_id_count, tcp_ts & tcp_ts_stamp,
since those fields are only manipulated while refcnt > 0.
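
A condensed sketch of the two-stage lookup described in 7), assuming the
helpers introduced by this patch (getpeer_sketch() is a hypothetical name;
the create/allocation branch and the unused-list handling are left out,
see the full diff below):

static struct inet_peer *getpeer_sketch(__be32 daddr)
{
	struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
	struct inet_peer *p;

	/* Fast path: RCU walk, no lock taken. It can miss a node that a
	 * concurrent writer is currently moving, and it gives up after
	 * PEER_MAXDEPTH links in case a rebalance steered it into a loop.
	 */
	rcu_read_lock_bh();
	p = lookup_rcu_bh(daddr);
	rcu_read_unlock_bh();
	if (p)
		return p;

	/* Slow path: same lookup, but under peers.lock, where the tree
	 * is guaranteed to be consistent.
	 */
	spin_lock_bh(&peers.lock);
	p = lookup(daddr, stack);
	if (p != peer_avl_empty)
		atomic_inc(&p->refcnt);
	else
		p = NULL;
	spin_unlock_bh(&peers.lock);

	return p;
}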
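
On the deletion side (items 5 and 6), the pattern is: take the last
reference atomically under the lock, then defer the actual free until
after a grace period. A condensed view (unlink_and_free() is a
hypothetical wrapper; the AVL unlink itself is elided, see
unlink_from_pool() in the diff below):

static void inetpeer_free_rcu(struct rcu_head *head)
{
	kmem_cache_free(peer_cachep,
			container_of(head, struct inet_peer, rcu));
}

static void unlink_and_free(struct inet_peer *p)
{
	int do_free = 0;

	spin_lock_bh(&peers.lock);
	/* Only the path that wins the 1 -> 0 transition may unlink the
	 * node; a lockless reader that just grabbed a reference with
	 * atomic_inc_not_zero() makes this cmpxchg fail.
	 */
	if (atomic_cmpxchg(&p->refcnt, 1, 0) == 1) {
		/* AVL unlink elided, see unlink_from_pool() */
		peers.total--;
		do_free = 1;
	}
	spin_unlock_bh(&peers.lock);

	if (do_free)
		/* Readers may still be walking over this node, so free it
		 * only after an RCU grace period.
		 */
		call_rcu_bh(&p->rcu, inetpeer_free_rcu);
	else
		inet_putpeer(p);
}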
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/inetpeer.c | 164 |
1 file changed, 95 insertions, 69 deletions
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 035673fd42d4..58fbc7e2475e 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -51,8 +51,8 @@ | |||
51 | * lookups performed with disabled BHs. | 51 | * lookups performed with disabled BHs. |
52 | * | 52 | * |
53 | * Serialisation issues. | 53 | * Serialisation issues. |
54 | * 1. Nodes may appear in the tree only with the pool write lock held. | 54 | * 1. Nodes may appear in the tree only with the pool lock held. |
55 | * 2. Nodes may disappear from the tree only with the pool write lock held | 55 | * 2. Nodes may disappear from the tree only with the pool lock held |
56 | * AND reference count being 0. | 56 | * AND reference count being 0. |
57 | * 3. Nodes appears and disappears from unused node list only under | 57 | * 3. Nodes appears and disappears from unused node list only under |
58 | * "inet_peer_unused_lock". | 58 | * "inet_peer_unused_lock". |
@@ -80,11 +80,11 @@ static const struct inet_peer peer_fake_node = { | |||
80 | 80 | ||
81 | static struct { | 81 | static struct { |
82 | struct inet_peer *root; | 82 | struct inet_peer *root; |
83 | rwlock_t lock; | 83 | spinlock_t lock; |
84 | int total; | 84 | int total; |
85 | } peers = { | 85 | } peers = { |
86 | .root = peer_avl_empty, | 86 | .root = peer_avl_empty, |
87 | .lock = __RW_LOCK_UNLOCKED(peers.lock), | 87 | .lock = __SPIN_LOCK_UNLOCKED(peers.lock), |
88 | .total = 0, | 88 | .total = 0, |
89 | }; | 89 | }; |
90 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ | 90 | #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ |
@@ -129,7 +129,7 @@ void __init inet_initpeers(void) | |||
129 | 129 | ||
130 | peer_cachep = kmem_cache_create("inet_peer_cache", | 130 | peer_cachep = kmem_cache_create("inet_peer_cache", |
131 | sizeof(struct inet_peer), | 131 | sizeof(struct inet_peer), |
132 | 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, | 132 | 0, SLAB_PANIC, |
133 | NULL); | 133 | NULL); |
134 | 134 | ||
135 | /* All the timers, started at system startup tend | 135 | /* All the timers, started at system startup tend |
@@ -153,16 +153,13 @@ static void unlink_from_unused(struct inet_peer *p) | |||
153 | 153 | ||
154 | /* | 154 | /* |
155 | * Called with local BH disabled and the pool lock held. | 155 | * Called with local BH disabled and the pool lock held. |
156 | * _stack is known to be NULL or not at compile time, | ||
157 | * so compiler will optimize the if (_stack) tests. | ||
158 | */ | 156 | */ |
159 | #define lookup(_daddr, _stack) \ | 157 | #define lookup(_daddr, _stack) \ |
160 | ({ \ | 158 | ({ \ |
161 | struct inet_peer *u, **v; \ | 159 | struct inet_peer *u, **v; \ |
162 | if (_stack != NULL) { \ | 160 | \ |
163 | stackptr = _stack; \ | 161 | stackptr = _stack; \ |
164 | *stackptr++ = &peers.root; \ | 162 | *stackptr++ = &peers.root; \ |
165 | } \ | ||
166 | for (u = peers.root; u != peer_avl_empty; ) { \ | 163 | for (u = peers.root; u != peer_avl_empty; ) { \ |
167 | if (_daddr == u->v4daddr) \ | 164 | if (_daddr == u->v4daddr) \ |
168 | break; \ | 165 | break; \ |
@@ -170,14 +167,41 @@ static void unlink_from_unused(struct inet_peer *p) | |||
170 | v = &u->avl_left; \ | 167 | v = &u->avl_left; \ |
171 | else \ | 168 | else \ |
172 | v = &u->avl_right; \ | 169 | v = &u->avl_right; \ |
173 | if (_stack != NULL) \ | 170 | *stackptr++ = v; \ |
174 | *stackptr++ = v; \ | ||
175 | u = *v; \ | 171 | u = *v; \ |
176 | } \ | 172 | } \ |
177 | u; \ | 173 | u; \ |
178 | }) | 174 | }) |
179 | 175 | ||
180 | /* Called with local BH disabled and the pool write lock held. */ | 176 | /* |
177 | * Called with rcu_read_lock_bh() | ||
178 | * Because we hold no lock against a writer, its quite possible we fall | ||
179 | * in an endless loop. | ||
180 | * But every pointer we follow is guaranteed to be valid thanks to RCU. | ||
181 | * We exit from this function if number of links exceeds PEER_MAXDEPTH | ||
182 | */ | ||
183 | static struct inet_peer *lookup_rcu_bh(__be32 daddr) | ||
184 | { | ||
185 | struct inet_peer *u = rcu_dereference_bh(peers.root); | ||
186 | int count = 0; | ||
187 | |||
188 | while (u != peer_avl_empty) { | ||
189 | if (daddr == u->v4daddr) { | ||
190 | if (unlikely(!atomic_inc_not_zero(&u->refcnt))) | ||
191 | u = NULL; | ||
192 | return u; | ||
193 | } | ||
194 | if ((__force __u32)daddr < (__force __u32)u->v4daddr) | ||
195 | u = rcu_dereference_bh(u->avl_left); | ||
196 | else | ||
197 | u = rcu_dereference_bh(u->avl_right); | ||
198 | if (unlikely(++count == PEER_MAXDEPTH)) | ||
199 | break; | ||
200 | } | ||
201 | return NULL; | ||
202 | } | ||
203 | |||
204 | /* Called with local BH disabled and the pool lock held. */ | ||
181 | #define lookup_rightempty(start) \ | 205 | #define lookup_rightempty(start) \ |
182 | ({ \ | 206 | ({ \ |
183 | struct inet_peer *u, **v; \ | 207 | struct inet_peer *u, **v; \ |
@@ -191,9 +215,10 @@ static void unlink_from_unused(struct inet_peer *p) | |||
191 | u; \ | 215 | u; \ |
192 | }) | 216 | }) |
193 | 217 | ||
194 | /* Called with local BH disabled and the pool write lock held. | 218 | /* Called with local BH disabled and the pool lock held. |
195 | * Variable names are the proof of operation correctness. | 219 | * Variable names are the proof of operation correctness. |
196 | * Look into mm/map_avl.c for more detail description of the ideas. */ | 220 | * Look into mm/map_avl.c for more detail description of the ideas. |
221 | */ | ||
197 | static void peer_avl_rebalance(struct inet_peer **stack[], | 222 | static void peer_avl_rebalance(struct inet_peer **stack[], |
198 | struct inet_peer ***stackend) | 223 | struct inet_peer ***stackend) |
199 | { | 224 | { |
@@ -269,16 +294,22 @@ static void peer_avl_rebalance(struct inet_peer **stack[], | |||
269 | } | 294 | } |
270 | } | 295 | } |
271 | 296 | ||
272 | /* Called with local BH disabled and the pool write lock held. */ | 297 | /* Called with local BH disabled and the pool lock held. */ |
273 | #define link_to_pool(n) \ | 298 | #define link_to_pool(n) \ |
274 | do { \ | 299 | do { \ |
275 | n->avl_height = 1; \ | 300 | n->avl_height = 1; \ |
276 | n->avl_left = peer_avl_empty; \ | 301 | n->avl_left = peer_avl_empty; \ |
277 | n->avl_right = peer_avl_empty; \ | 302 | n->avl_right = peer_avl_empty; \ |
303 | smp_wmb(); /* lockless readers can catch us now */ \ | ||
278 | **--stackptr = n; \ | 304 | **--stackptr = n; \ |
279 | peer_avl_rebalance(stack, stackptr); \ | 305 | peer_avl_rebalance(stack, stackptr); \ |
280 | } while (0) | 306 | } while (0) |
281 | 307 | ||
308 | static void inetpeer_free_rcu(struct rcu_head *head) | ||
309 | { | ||
310 | kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); | ||
311 | } | ||
312 | |||
282 | /* May be called with local BH enabled. */ | 313 | /* May be called with local BH enabled. */ |
283 | static void unlink_from_pool(struct inet_peer *p) | 314 | static void unlink_from_pool(struct inet_peer *p) |
284 | { | 315 | { |
@@ -286,13 +317,13 @@ static void unlink_from_pool(struct inet_peer *p) | |||
286 | 317 | ||
287 | do_free = 0; | 318 | do_free = 0; |
288 | 319 | ||
289 | write_lock_bh(&peers.lock); | 320 | spin_lock_bh(&peers.lock); |
290 | /* Check the reference counter. It was artificially incremented by 1 | 321 | /* Check the reference counter. It was artificially incremented by 1 |
291 | * in cleanup() function to prevent sudden disappearing. If the | 322 | * in cleanup() function to prevent sudden disappearing. If we can |
292 | * reference count is still 1 then the node is referenced only as `p' | 323 | * atomically (because of lockless readers) take this last reference, |
293 | * here and from the pool. So under the exclusive pool lock it's safe | 324 | * it's safe to remove the node and free it later. |
294 | * to remove the node and free it later. */ | 325 | */ |
295 | if (atomic_read(&p->refcnt) == 1) { | 326 | if (atomic_cmpxchg(&p->refcnt, 1, 0) == 1) { |
296 | struct inet_peer **stack[PEER_MAXDEPTH]; | 327 | struct inet_peer **stack[PEER_MAXDEPTH]; |
297 | struct inet_peer ***stackptr, ***delp; | 328 | struct inet_peer ***stackptr, ***delp; |
298 | if (lookup(p->v4daddr, stack) != p) | 329 | if (lookup(p->v4daddr, stack) != p) |
@@ -321,17 +352,18 @@ static void unlink_from_pool(struct inet_peer *p) | |||
321 | peers.total--; | 352 | peers.total--; |
322 | do_free = 1; | 353 | do_free = 1; |
323 | } | 354 | } |
324 | write_unlock_bh(&peers.lock); | 355 | spin_unlock_bh(&peers.lock); |
325 | 356 | ||
326 | if (do_free) | 357 | if (do_free) |
327 | kmem_cache_free(peer_cachep, p); | 358 | call_rcu_bh(&p->rcu, inetpeer_free_rcu); |
328 | else | 359 | else |
329 | /* The node is used again. Decrease the reference counter | 360 | /* The node is used again. Decrease the reference counter |
330 | * back. The loop "cleanup -> unlink_from_unused | 361 | * back. The loop "cleanup -> unlink_from_unused |
331 | * -> unlink_from_pool -> putpeer -> link_to_unused | 362 | * -> unlink_from_pool -> putpeer -> link_to_unused |
332 | * -> cleanup (for the same node)" | 363 | * -> cleanup (for the same node)" |
333 | * doesn't really exist because the entry will have a | 364 | * doesn't really exist because the entry will have a |
334 | * recent deletion time and will not be cleaned again soon. */ | 365 | * recent deletion time and will not be cleaned again soon. |
366 | */ | ||
335 | inet_putpeer(p); | 367 | inet_putpeer(p); |
336 | } | 368 | } |
337 | 369 | ||
@@ -375,62 +407,56 @@ static int cleanup_once(unsigned long ttl) | |||
375 | /* Called with or without local BH being disabled. */ | 407 | /* Called with or without local BH being disabled. */ |
376 | struct inet_peer *inet_getpeer(__be32 daddr, int create) | 408 | struct inet_peer *inet_getpeer(__be32 daddr, int create) |
377 | { | 409 | { |
378 | struct inet_peer *p, *n; | 410 | struct inet_peer *p; |
379 | struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; | 411 | struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; |
380 | 412 | ||
381 | /* Look up for the address quickly. */ | 413 | /* Look up for the address quickly, lockless. |
382 | read_lock_bh(&peers.lock); | 414 | * Because of a concurrent writer, we might not find an existing entry. |
383 | p = lookup(daddr, NULL); | 415 | */ |
384 | if (p != peer_avl_empty) | 416 | rcu_read_lock_bh(); |
385 | atomic_inc(&p->refcnt); | 417 | p = lookup_rcu_bh(daddr); |
386 | read_unlock_bh(&peers.lock); | 418 | rcu_read_unlock_bh(); |
419 | |||
420 | if (p) { | ||
421 | /* The existing node has been found. | ||
422 | * Remove the entry from unused list if it was there. | ||
423 | */ | ||
424 | unlink_from_unused(p); | ||
425 | return p; | ||
426 | } | ||
387 | 427 | ||
428 | /* retry an exact lookup, taking the lock before. | ||
429 | * At least, nodes should be hot in our cache. | ||
430 | */ | ||
431 | spin_lock_bh(&peers.lock); | ||
432 | p = lookup(daddr, stack); | ||
388 | if (p != peer_avl_empty) { | 433 | if (p != peer_avl_empty) { |
389 | /* The existing node has been found. */ | 434 | atomic_inc(&p->refcnt); |
435 | spin_unlock_bh(&peers.lock); | ||
390 | /* Remove the entry from unused list if it was there. */ | 436 | /* Remove the entry from unused list if it was there. */ |
391 | unlink_from_unused(p); | 437 | unlink_from_unused(p); |
392 | return p; | 438 | return p; |
393 | } | 439 | } |
394 | 440 | p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; | |
395 | if (!create) | 441 | if (p) { |
396 | return NULL; | 442 | p->v4daddr = daddr; |
397 | 443 | atomic_set(&p->refcnt, 1); | |
398 | /* Allocate the space outside the locked region. */ | 444 | atomic_set(&p->rid, 0); |
399 | n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); | 445 | atomic_set(&p->ip_id_count, secure_ip_id(daddr)); |
400 | if (n == NULL) | 446 | p->tcp_ts_stamp = 0; |
401 | return NULL; | 447 | INIT_LIST_HEAD(&p->unused); |
402 | n->v4daddr = daddr; | 448 | |
403 | atomic_set(&n->refcnt, 1); | 449 | |
404 | atomic_set(&n->rid, 0); | 450 | /* Link the node. */ |
405 | atomic_set(&n->ip_id_count, secure_ip_id(daddr)); | 451 | link_to_pool(p); |
406 | n->tcp_ts_stamp = 0; | 452 | peers.total++; |
407 | 453 | } | |
408 | write_lock_bh(&peers.lock); | 454 | spin_unlock_bh(&peers.lock); |
409 | /* Check if an entry has suddenly appeared. */ | ||
410 | p = lookup(daddr, stack); | ||
411 | if (p != peer_avl_empty) | ||
412 | goto out_free; | ||
413 | |||
414 | /* Link the node. */ | ||
415 | link_to_pool(n); | ||
416 | INIT_LIST_HEAD(&n->unused); | ||
417 | peers.total++; | ||
418 | write_unlock_bh(&peers.lock); | ||
419 | 455 | ||
420 | if (peers.total >= inet_peer_threshold) | 456 | if (peers.total >= inet_peer_threshold) |
421 | /* Remove one less-recently-used entry. */ | 457 | /* Remove one less-recently-used entry. */ |
422 | cleanup_once(0); | 458 | cleanup_once(0); |
423 | 459 | ||
424 | return n; | ||
425 | |||
426 | out_free: | ||
427 | /* The appropriate node is already in the pool. */ | ||
428 | atomic_inc(&p->refcnt); | ||
429 | write_unlock_bh(&peers.lock); | ||
430 | /* Remove the entry from unused list if it was there. */ | ||
431 | unlink_from_unused(p); | ||
432 | /* Free preallocated the preallocated node. */ | ||
433 | kmem_cache_free(peer_cachep, n); | ||
434 | return p; | 460 | return p; |
435 | } | 461 | } |
436 | 462 | ||