author		Eric Dumazet <eric.dumazet@gmail.com>	2011-06-08 09:35:34 -0400
committer	David S. Miller <davem@davemloft.net>	2011-06-08 20:05:30 -0400
commit		4b9d9be839fdb7dcd7ce7619a623fd9015a50cda
tree		bd1827203efe27578b783c30b0ff5e2d4966b26a	/net/ipv4/inetpeer.c
parent		9ad7c049f0f79c418e293b1b68cf10d68f54fcdb
inetpeer: remove unused list
Andi Kleen and Tim Chen reported huge contention on the inetpeer
unused_peers.lock, on a memcached workload on a 40-core machine with the
route cache disabled.
It appears we constantly flip peer refcnt between 0 and 1, and must
insert/remove peers from unused_peers.list while holding a contended
spinlock.
Remove this list completely and perform garbage collection on the fly,
at lookup time, using the expired nodes encountered during the tree
traversal.
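
To make the new scheme concrete, here is a minimal userspace C11 sketch of the
claim-then-collect idea (illustration only: struct node, collect_expired() and
its parameters are invented for this sketch; the real code is inet_peer_gc()
in the diff below, which walks the AVL stack filled by lookup() and uses the
kernel's atomic_cmpxchg()):

#include <stdatomic.h>
#include <stddef.h>

/* A node is "unused" while refcnt == 0; flipping it to -1 makes it look
 * deleted to lockless readers, so the winner of the cmpxchg may safely
 * unlink it from the tree and free it afterwards.
 */
struct node {
	atomic_int refcnt;		/* 0 = unused, -1 = being freed */
	unsigned int dtime;		/* timestamp of the last put */
	struct node *gc_next;		/* private chain used only during GC */
};

static struct node *collect_expired(struct node **seen, size_t n,
				    unsigned int now, unsigned int ttl)
{
	struct node *gchead = NULL;

	for (size_t i = 0; i < n; i++) {
		struct node *p = seen[i];
		int unused = 0;

		if (now - p->dtime < ttl)
			continue;	/* entry is still fresh */
		/* claim: only succeeds if nobody holds a reference */
		if (atomic_compare_exchange_strong(&p->refcnt, &unused, -1)) {
			p->gc_next = gchead;
			gchead = p;
		}
	}
	return gchead;	/* caller unlinks these from the tree and frees them */
}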
The change removes a lot of code, makes locking more standard, and
obsoletes two sysctls (inet_peer_gc_mintime and inet_peer_gc_maxtime).
It also removes two pointers from the inet_peer structure.
There is still a false sharing effect because refcnt sits in the first
cache line of the object [where the links and keys used by lookups are
located]; we might move it to the end of the inet_peer structure to keep
this first cache line mostly read-shared across CPUs.
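
A possible follow-up, sketched below in plain C11 for illustration (this patch
does not reorder the structure; peer_model and its fields only mirror
struct inet_peer, and alignas(64) merely makes the cache-line split explicit
in a standalone model, where the kernel would simply move the field):

#include <stdalign.h>
#include <stdatomic.h>
#include <stdint.h>

struct peer_model {
	/* read-mostly: AVL links and the address key walked by every
	 * lockless lookup; ideally this first cache line is only read. */
	struct peer_model *avl_left, *avl_right;
	uint32_t daddr[4];		/* IPv4 or IPv6 key */
	uint32_t avl_height;

	/* write-hot: touched on every get/put, kept off the lookup line
	 * so those writes do not keep invalidating it on other CPUs. */
	alignas(64) atomic_int refcnt;
	uint32_t dtime;
};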
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Andi Kleen <andi@firstfloor.org>
CC: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/inetpeer.c')
-rw-r--r--	net/ipv4/inetpeer.c	280
1 file changed, 73 insertions(+), 207 deletions(-)
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index ce616d92cc54..dafbf2c98b28 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -54,15 +54,11 @@
  * 1. Nodes may appear in the tree only with the pool lock held.
  * 2. Nodes may disappear from the tree only with the pool lock held
  *	AND reference count being 0.
- * 3. Nodes appears and disappears from unused node list only under
- *	"inet_peer_unused_lock".
- * 4. Global variable peer_total is modified under the pool lock.
- * 5. struct inet_peer fields modification:
+ * 3. Global variable peer_total is modified under the pool lock.
+ * 4. struct inet_peer fields modification:
  *	avl_left, avl_right, avl_parent, avl_height: pool lock
- *	unused: unused node list lock
  *	refcnt: atomically against modifications on other CPU;
  *	   usually under some other lock to prevent node disappearing
- *	dtime: unused node list lock
  *	daddr: unchangeable
  *	ip_id_count: atomic value (no lock needed)
  */
@@ -104,19 +100,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries more
 					 * aggressively at this stage */
 int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */
 int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */
-int inet_peer_gc_mintime __read_mostly = 10 * HZ;
-int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
-
-static struct {
-	struct list_head	list;
-	spinlock_t		lock;
-} unused_peers = {
-	.list	= LIST_HEAD_INIT(unused_peers.list),
-	.lock	= __SPIN_LOCK_UNLOCKED(unused_peers.lock),
-};
-
-static void peer_check_expire(unsigned long dummy);
-static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
 
 
 /* Called from ip_output.c:ip_init */
@@ -142,21 +125,6 @@ void __init inet_initpeers(void)
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
 			NULL);
 
-	/* All the timers, started at system startup tend
-	   to synchronize. Perturb it a bit.
-	 */
-	peer_periodic_timer.expires = jiffies
-		+ net_random() % inet_peer_gc_maxtime
-		+ inet_peer_gc_maxtime;
-	add_timer(&peer_periodic_timer);
-}
-
-/* Called with or without local BH being disabled. */
-static void unlink_from_unused(struct inet_peer *p)
-{
-	spin_lock_bh(&unused_peers.lock);
-	list_del_init(&p->unused);
-	spin_unlock_bh(&unused_peers.lock);
 }
 
 static int addr_compare(const struct inetpeer_addr *a,
@@ -203,20 +171,6 @@ static int addr_compare(const struct inetpeer_addr *a,
 	u;						\
 })
 
-static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
-{
-	int cur, old = atomic_read(ptr);
-
-	while (old != u) {
-		*newv = old + a;
-		cur = atomic_cmpxchg(ptr, old, *newv);
-		if (cur == old)
-			return true;
-		old = cur;
-	}
-	return false;
-}
-
 /*
  * Called with rcu_read_lock()
  * Because we hold no lock against a writer, its quite possible we fall
@@ -225,8 +179,7 @@ static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
  * We exit from this function if number of links exceeds PEER_MAXDEPTH
  */
 static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
-				    struct inet_peer_base *base,
-				    int *newrefcnt)
+				    struct inet_peer_base *base)
 {
 	struct inet_peer *u = rcu_dereference(base->root);
 	int count = 0;
@@ -235,11 +188,9 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
 		int cmp = addr_compare(daddr, &u->daddr);
 		if (cmp == 0) {
 			/* Before taking a reference, check if this entry was
-			 * deleted, unlink_from_pool() sets refcnt=-1 to make
-			 * distinction between an unused entry (refcnt=0) and
-			 * a freed one.
+			 * deleted (refcnt=-1)
 			 */
-			if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt))
+			if (!atomic_add_unless(&u->refcnt, 1, -1))
 				u = NULL;
 			return u;
 		}
@@ -366,137 +317,96 @@ static void inetpeer_free_rcu(struct rcu_head *head)
 	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
 }
 
-/* May be called with local BH enabled. */
 static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
 			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])
 {
-	int do_free;
+	struct inet_peer __rcu ***stackptr, ***delp;
 
-	do_free = 0;
-
-	write_seqlock_bh(&base->lock);
-	/* Check the reference counter. It was artificially incremented by 1
-	 * in cleanup() function to prevent sudden disappearing. If we can
-	 * atomically (because of lockless readers) take this last reference,
-	 * it's safe to remove the node and free it later.
-	 * We use refcnt=-1 to alert lockless readers this entry is deleted.
-	 */
-	if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
-		struct inet_peer __rcu ***stackptr, ***delp;
-		if (lookup(&p->daddr, stack, base) != p)
-			BUG();
-		delp = stackptr - 1; /* *delp[0] == p */
-		if (p->avl_left == peer_avl_empty_rcu) {
-			*delp[0] = p->avl_right;
-			--stackptr;
-		} else {
-			/* look for a node to insert instead of p */
-			struct inet_peer *t;
-			t = lookup_rightempty(p, base);
-			BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
-			**--stackptr = t->avl_left;
-			/* t is removed, t->daddr > x->daddr for any
-			 * x in p->avl_left subtree.
-			 * Put t in the old place of p. */
-			RCU_INIT_POINTER(*delp[0], t);
-			t->avl_left = p->avl_left;
-			t->avl_right = p->avl_right;
-			t->avl_height = p->avl_height;
-			BUG_ON(delp[1] != &p->avl_left);
-			delp[1] = &t->avl_left; /* was &p->avl_left */
-		}
-		peer_avl_rebalance(stack, stackptr, base);
-		base->total--;
-		do_free = 1;
+	if (lookup(&p->daddr, stack, base) != p)
+		BUG();
+	delp = stackptr - 1; /* *delp[0] == p */
+	if (p->avl_left == peer_avl_empty_rcu) {
+		*delp[0] = p->avl_right;
+		--stackptr;
+	} else {
+		/* look for a node to insert instead of p */
+		struct inet_peer *t;
+		t = lookup_rightempty(p, base);
+		BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
+		**--stackptr = t->avl_left;
+		/* t is removed, t->daddr > x->daddr for any
+		 * x in p->avl_left subtree.
+		 * Put t in the old place of p. */
+		RCU_INIT_POINTER(*delp[0], t);
+		t->avl_left = p->avl_left;
+		t->avl_right = p->avl_right;
+		t->avl_height = p->avl_height;
+		BUG_ON(delp[1] != &p->avl_left);
+		delp[1] = &t->avl_left; /* was &p->avl_left */
 	}
-	write_sequnlock_bh(&base->lock);
-
-	if (do_free)
-		call_rcu(&p->rcu, inetpeer_free_rcu);
-	else
-		/* The node is used again. Decrease the reference counter
-		 * back. The loop "cleanup -> unlink_from_unused
-		 *   -> unlink_from_pool -> putpeer -> link_to_unused
-		 *   -> cleanup (for the same node)"
-		 * doesn't really exist because the entry will have a
-		 * recent deletion time and will not be cleaned again soon.
-		 */
-		inet_putpeer(p);
+	peer_avl_rebalance(stack, stackptr, base);
+	base->total--;
+	call_rcu(&p->rcu, inetpeer_free_rcu);
 }
 
 static struct inet_peer_base *family_to_base(int family)
 {
-	return (family == AF_INET ? &v4_peers : &v6_peers);
+	return family == AF_INET ? &v4_peers : &v6_peers;
 }
 
-static struct inet_peer_base *peer_to_base(struct inet_peer *p)
+/* perform garbage collect on all items stacked during a lookup */
+static int inet_peer_gc(struct inet_peer_base *base,
+			struct inet_peer __rcu **stack[PEER_MAXDEPTH],
+			struct inet_peer __rcu ***stackptr)
 {
-	return family_to_base(p->daddr.family);
-}
-
-/* May be called with local BH enabled. */
-static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH])
-{
-	struct inet_peer *p = NULL;
-
-	/* Remove the first entry from the list of unused nodes. */
-	spin_lock_bh(&unused_peers.lock);
-	if (!list_empty(&unused_peers.list)) {
-		__u32 delta;
+	struct inet_peer *p, *gchead = NULL;
+	__u32 delta, ttl;
+	int cnt = 0;
 
-		p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
+	if (base->total >= inet_peer_threshold)
+		ttl = 0; /* be aggressive */
+	else
+		ttl = inet_peer_maxttl
+				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
+					base->total / inet_peer_threshold * HZ;
+	stackptr--; /* last stack slot is peer_avl_empty */
+	while (stackptr > stack) {
+		stackptr--;
+		p = rcu_deref_locked(**stackptr, base);
 		delta = (__u32)jiffies - p->dtime;
-
-		if (delta < ttl) {
-			/* Do not prune fresh entries. */
-			spin_unlock_bh(&unused_peers.lock);
-			return -1;
+		if (atomic_read(&p->refcnt) == 0 && delta >= ttl &&
+		    atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
+			p->gc_next = gchead;
+			gchead = p;
 		}
-
-		list_del_init(&p->unused);
-
-		/* Grab an extra reference to prevent node disappearing
-		 * before unlink_from_pool() call. */
-		atomic_inc(&p->refcnt);
 	}
-	spin_unlock_bh(&unused_peers.lock);
-
-	if (p == NULL)
-		/* It means that the total number of USED entries has
-		 * grown over inet_peer_threshold. It shouldn't really
-		 * happen because of entry limits in route cache. */
-		return -1;
-
-	unlink_from_pool(p, peer_to_base(p), stack);
-	return 0;
+	while ((p = gchead) != NULL) {
+		gchead = p->gc_next;
+		cnt++;
+		unlink_from_pool(p, base, stack);
+	}
+	return cnt;
 }
 
-/* Called with or without local BH being disabled. */
 struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
 {
 	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
 	struct inet_peer_base *base = family_to_base(daddr->family);
 	struct inet_peer *p;
 	unsigned int sequence;
-	int invalidated, newrefcnt = 0;
+	int invalidated, gccnt = 0;
 
-	/* Look up for the address quickly, lockless.
+	/* Attempt a lockless lookup first.
 	 * Because of a concurrent writer, we might not find an existing entry.
 	 */
 	rcu_read_lock();
 	sequence = read_seqbegin(&base->lock);
-	p = lookup_rcu(daddr, base, &newrefcnt);
+	p = lookup_rcu(daddr, base);
 	invalidated = read_seqretry(&base->lock, sequence);
 	rcu_read_unlock();
 
-	if (p) {
-found:		/* The existing node has been found.
-		 * Remove the entry from unused list if it was there.
-		 */
-		if (newrefcnt == 1)
-			unlink_from_unused(p);
+	if (p)
 		return p;
-	}
 
 	/* If no writer did a change during our lookup, we can return early. */
 	if (!create && !invalidated)
@@ -506,11 +416,17 @@ found: /* The existing node has been found.
 	 * At least, nodes should be hot in our cache.
 	 */
 	write_seqlock_bh(&base->lock);
+relookup:
 	p = lookup(daddr, stack, base);
 	if (p != peer_avl_empty) {
-		newrefcnt = atomic_inc_return(&p->refcnt);
+		atomic_inc(&p->refcnt);
 		write_sequnlock_bh(&base->lock);
-		goto found;
+		return p;
+	}
+	if (!gccnt) {
+		gccnt = inet_peer_gc(base, stack, stackptr);
+		if (gccnt && create)
+			goto relookup;
 	}
 	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
 	if (p) {
@@ -525,7 +441,6 @@ found: /* The existing node has been found.
 		p->pmtu_expires = 0;
 		p->pmtu_orig = 0;
 		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
-		INIT_LIST_HEAD(&p->unused);
 
 
 		/* Link the node. */
@@ -534,63 +449,14 @@ found: /* The existing node has been found.
 	}
 	write_sequnlock_bh(&base->lock);
 
-	if (base->total >= inet_peer_threshold)
-		/* Remove one less-recently-used entry. */
-		cleanup_once(0, stack);
-
 	return p;
 }
-
-static int compute_total(void)
-{
-	return v4_peers.total + v6_peers.total;
-}
 EXPORT_SYMBOL_GPL(inet_getpeer);
 
-/* Called with local BH disabled. */
-static void peer_check_expire(unsigned long dummy)
-{
-	unsigned long now = jiffies;
-	int ttl, total;
-	struct inet_peer __rcu **stack[PEER_MAXDEPTH];
-
-	total = compute_total();
-	if (total >= inet_peer_threshold)
-		ttl = inet_peer_minttl;
-	else
-		ttl = inet_peer_maxttl
-				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
-					total / inet_peer_threshold * HZ;
-	while (!cleanup_once(ttl, stack)) {
-		if (jiffies != now)
-			break;
-	}
-
-	/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
-	 * interval depending on the total number of entries (more entries,
-	 * less interval). */
-	total = compute_total();
-	if (total >= inet_peer_threshold)
-		peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
-	else
-		peer_periodic_timer.expires = jiffies
-			+ inet_peer_gc_maxtime
-			- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
-				total / inet_peer_threshold * HZ;
-	add_timer(&peer_periodic_timer);
-}
-
 void inet_putpeer(struct inet_peer *p)
 {
-	local_bh_disable();
-
-	if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
-		list_add_tail(&p->unused, &unused_peers.list);
-		p->dtime = (__u32)jiffies;
-		spin_unlock(&unused_peers.lock);
-	}
-
-	local_bh_enable();
+	p->dtime = (__u32)jiffies;
+	atomic_dec(&p->refcnt);
 }
 EXPORT_SYMBOL_GPL(inet_putpeer);
 
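
For reference, the TTL computed by the new inet_peer_gc() scales linearly with
pool occupancy, from inet_peer_maxttl for an empty pool down to about
inet_peer_minttl near the threshold, and zero once inet_peer_threshold is
reached. A reader's model in seconds (gc_ttl_secs() is illustrative; the patch
works in jiffies and its "/ HZ ... * HZ" integer arithmetic rounds slightly
differently):

static unsigned int gc_ttl_secs(unsigned int total)
{
	const unsigned int minttl = 120;		/* inet_peer_minttl / HZ */
	const unsigned int maxttl = 600;		/* inet_peer_maxttl / HZ */
	const unsigned int threshold = 65536 + 128;	/* inet_peer_threshold */

	if (total >= threshold)
		return 0;	/* pool over threshold: reclaim anything unused */
	/* empty pool -> 600 s, near the threshold -> about 120 s */
	return maxttl - (maxttl - minttl) * total / threshold;
}

/* gc_ttl_secs(0) == 600, gc_ttl_secs(32832) == 360, gc_ttl_secs(65664) == 0 */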