-rw-r--r--   Documentation/networking/ip-sysctl.txt    10
-rw-r--r--   include/net/inetpeer.h                      2
-rw-r--r--   include/net/ip.h                            2
-rw-r--r--   net/ipv4/inetpeer.c                       280
-rw-r--r--   net/ipv4/sysctl_net_ipv4.c                 14
5 files changed, 74 insertions, 234 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index d3d653a5f9b9..3dcb26cd33f6 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -106,16 +106,6 @@ inet_peer_maxttl - INTEGER
 	when the number of entries in the pool is very small).
 	Measured in seconds.
 
-inet_peer_gc_mintime - INTEGER
-	Minimum interval between garbage collection passes. This interval is
-	in effect under high memory pressure on the pool.
-	Measured in seconds.
-
-inet_peer_gc_maxtime - INTEGER
-	Minimum interval between garbage collection passes. This interval is
-	in effect under low (or absent) memory pressure on the pool.
-	Measured in seconds.
-
 TCP variables:
 
 somaxconn - INTEGER
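
With the two gc interval knobs gone, the remaining sysctls (inet_peer_threshold, inet_peer_minttl, inet_peer_maxttl) feed the TTL that the per-lookup collector in net/ipv4/inetpeer.c computes: full maxttl when the pool is nearly empty, shrinking linearly toward minttl as the pool fills, and zero once the threshold is crossed. A standalone userspace sketch of that interpolation, using the kernel defaults and an assumed HZ of 1000 (not the kernel code itself):

#include <stdio.h>

#define HZ 1000				/* assumed tick rate for this sketch */

static const unsigned int inet_peer_threshold = 65536 + 128;
static const unsigned int inet_peer_minttl    = 120 * HZ;	/* 120 sec */
static const unsigned int inet_peer_maxttl    = 10 * 60 * HZ;	/* 10 min  */

/* Same shape as the ttl computation in inet_peer_gc(): full maxttl for an
 * empty pool, shrinking linearly toward minttl as it fills, and 0 (collect
 * everything unreferenced) once the threshold is crossed.
 */
static unsigned int peer_gc_ttl(unsigned int total)
{
	if (total >= inet_peer_threshold)
		return 0;	/* be aggressive */
	return inet_peer_maxttl
		- (inet_peer_maxttl - inet_peer_minttl) / HZ *
			total / inet_peer_threshold * HZ;
}

int main(void)
{
	unsigned int samples[] = { 0, 16384, 32768, 65536, 70000 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("total=%5u  ttl=%u jiffies (%u sec)\n",
		       samples[i], peer_gc_ttl(samples[i]),
		       peer_gc_ttl(samples[i]) / HZ);
	return 0;
}

For the default threshold of 65664 entries this gives roughly 10 minutes of lifetime when the pool is empty and about 2 minutes just below the threshold.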
diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 8a159cc3d68b..1f0966f91760 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -32,7 +32,6 @@ struct inet_peer {
 	struct inet_peer __rcu	*avl_left, *avl_right;
 	struct inetpeer_addr	daddr;
 	__u32			avl_height;
-	struct list_head	unused;
 	__u32			dtime;	/* the time of last use of not
 					 * referenced entries */
 	atomic_t		refcnt;
@@ -56,6 +55,7 @@ struct inet_peer {
 			struct inetpeer_addr_base redirect_learned;
 		};
 		struct rcu_head		rcu;
+		struct inet_peer	*gc_next;
 	};
 };
 
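
The new gc_next pointer shares the union with the rcu_head (and the metrics payload), so struct inet_peer does not grow: a peer uses the payload while it is live, is threaded through gc_next only after the collector has claimed it, and only then hands the same bytes to call_rcu() for freeing. A rough userspace model of that phased reuse, with stub types and illustrative sizes:

#include <stdio.h>

/* Stand-ins for the kernel types; only the layout idea matters here. */
struct rcu_head_stub { void *next; void (*func)(void *); };

struct peer_stub {
	int refcnt;
	union {
		unsigned int metrics[16];	/* payload while the node is live      */
		struct rcu_head_stub rcu;	/* reused once queued for RCU freeing  */
		struct peer_stub *gc_next;	/* reused while chained for collection */
	};
};

int main(void)
{
	/* gc_next and rcu overlap the payload, so chaining nodes for gc and
	 * deferring the free add no per-entry storage: the struct is sized
	 * by the largest union member.
	 */
	printf("metrics %zu, rcu %zu, gc_next %zu, struct %zu bytes\n",
	       sizeof(((struct peer_stub *)0)->metrics),
	       sizeof(struct rcu_head_stub),
	       sizeof(struct peer_stub *),
	       sizeof(struct peer_stub));
	return 0;
}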
diff --git a/include/net/ip.h b/include/net/ip.h
index 66dd49149208..e9ea7c73f8c1 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -228,8 +228,6 @@ extern struct ctl_path net_ipv4_ctl_path[];
 extern int inet_peer_threshold;
 extern int inet_peer_minttl;
 extern int inet_peer_maxttl;
-extern int inet_peer_gc_mintime;
-extern int inet_peer_gc_maxtime;
 
 /* From ip_output.c */
 extern int sysctl_ip_dynaddr;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index ce616d92cc54..dafbf2c98b28 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -54,15 +54,11 @@
  *  1.  Nodes may appear in the tree only with the pool lock held.
  *  2.  Nodes may disappear from the tree only with the pool lock held
  *      AND reference count being 0.
- *  3.  Nodes appears and disappears from unused node list only under
- *      "inet_peer_unused_lock".
- *  4.  Global variable peer_total is modified under the pool lock.
- *  5.  struct inet_peer fields modification:
+ *  3.  Global variable peer_total is modified under the pool lock.
+ *  4.  struct inet_peer fields modification:
  *		avl_left, avl_right, avl_parent, avl_height: pool lock
- *		unused: unused node list lock
  *		refcnt: atomically against modifications on other CPU;
  *		   usually under some other lock to prevent node disappearing
- *		dtime: unused node list lock
  *		daddr: unchangeable
  *		ip_id_count: atomic value (no lock needed)
  */
@@ -104,19 +100,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m
 					 * aggressively at this stage */
 int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */
 int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */
-int inet_peer_gc_mintime __read_mostly = 10 * HZ;
-int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
-
-static struct {
-	struct list_head	list;
-	spinlock_t		lock;
-} unused_peers = {
-	.list		= LIST_HEAD_INIT(unused_peers.list),
-	.lock		= __SPIN_LOCK_UNLOCKED(unused_peers.lock),
-};
-
-static void peer_check_expire(unsigned long dummy);
-static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
 
 
 /* Called from ip_output.c:ip_init */
@@ -142,21 +125,6 @@ void __init inet_initpeers(void)
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
 			NULL);
 
-	/* All the timers, started at system startup tend
-	   to synchronize.  Perturb it a bit.
-	 */
-	peer_periodic_timer.expires = jiffies
-		+ net_random() % inet_peer_gc_maxtime
-		+ inet_peer_gc_maxtime;
-	add_timer(&peer_periodic_timer);
-}
-
-/* Called with or without local BH being disabled. */
-static void unlink_from_unused(struct inet_peer *p)
-{
-	spin_lock_bh(&unused_peers.lock);
-	list_del_init(&p->unused);
-	spin_unlock_bh(&unused_peers.lock);
 }
 
 static int addr_compare(const struct inetpeer_addr *a,
@@ -203,20 +171,6 @@ static int addr_compare(const struct inetpeer_addr *a,
 	u;							\
 })
 
-static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
-{
-	int cur, old = atomic_read(ptr);
-
-	while (old != u) {
-		*newv = old + a;
-		cur = atomic_cmpxchg(ptr, old, *newv);
-		if (cur == old)
-			return true;
-		old = cur;
-	}
-	return false;
-}
-
 /*
  * Called with rcu_read_lock()
  * Because we hold no lock against a writer, its quite possible we fall
@@ -225,8 +179,7 @@ static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
  * We exit from this function if number of links exceeds PEER_MAXDEPTH
  */
 static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
-				    struct inet_peer_base *base,
-				    int *newrefcnt)
+				    struct inet_peer_base *base)
 {
 	struct inet_peer *u = rcu_dereference(base->root);
 	int count = 0;
@@ -235,11 +188,9 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
 		int cmp = addr_compare(daddr, &u->daddr);
 		if (cmp == 0) {
 			/* Before taking a reference, check if this entry was
-			 * deleted, unlink_from_pool() sets refcnt=-1 to make
-			 * distinction between an unused entry (refcnt=0) and
-			 * a freed one.
+			 * deleted (refcnt=-1)
 			 */
-			if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt))
+			if (!atomic_add_unless(&u->refcnt, 1, -1))
 				u = NULL;
 			return u;
 		}
@@ -366,137 +317,96 @@ static void inetpeer_free_rcu(struct rcu_head *head)
 	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
 }
 
-/* May be called with local BH enabled. */
 static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
 			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])
 {
-	int do_free;
+	struct inet_peer __rcu ***stackptr, ***delp;
 
-	do_free = 0;
-
-	write_seqlock_bh(&base->lock);
-	/* Check the reference counter.  It was artificially incremented by 1
-	 * in cleanup() function to prevent sudden disappearing.  If we can
-	 * atomically (because of lockless readers) take this last reference,
-	 * it's safe to remove the node and free it later.
-	 * We use refcnt=-1 to alert lockless readers this entry is deleted.
-	 */
-	if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
-		struct inet_peer __rcu ***stackptr, ***delp;
-		if (lookup(&p->daddr, stack, base) != p)
-			BUG();
-		delp = stackptr - 1; /* *delp[0] == p */
-		if (p->avl_left == peer_avl_empty_rcu) {
-			*delp[0] = p->avl_right;
-			--stackptr;
-		} else {
-			/* look for a node to insert instead of p */
-			struct inet_peer *t;
-			t = lookup_rightempty(p, base);
-			BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
-			**--stackptr = t->avl_left;
-			/* t is removed, t->daddr > x->daddr for any
-			 * x in p->avl_left subtree.
-			 * Put t in the old place of p. */
-			RCU_INIT_POINTER(*delp[0], t);
-			t->avl_left = p->avl_left;
-			t->avl_right = p->avl_right;
-			t->avl_height = p->avl_height;
-			BUG_ON(delp[1] != &p->avl_left);
-			delp[1] = &t->avl_left; /* was &p->avl_left */
-		}
-		peer_avl_rebalance(stack, stackptr, base);
-		base->total--;
-		do_free = 1;
+	if (lookup(&p->daddr, stack, base) != p)
+		BUG();
+	delp = stackptr - 1; /* *delp[0] == p */
+	if (p->avl_left == peer_avl_empty_rcu) {
+		*delp[0] = p->avl_right;
+		--stackptr;
+	} else {
+		/* look for a node to insert instead of p */
+		struct inet_peer *t;
+		t = lookup_rightempty(p, base);
+		BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
+		**--stackptr = t->avl_left;
+		/* t is removed, t->daddr > x->daddr for any
+		 * x in p->avl_left subtree.
+		 * Put t in the old place of p. */
+		RCU_INIT_POINTER(*delp[0], t);
+		t->avl_left = p->avl_left;
+		t->avl_right = p->avl_right;
+		t->avl_height = p->avl_height;
+		BUG_ON(delp[1] != &p->avl_left);
+		delp[1] = &t->avl_left; /* was &p->avl_left */
 	}
-	write_sequnlock_bh(&base->lock);
-
-	if (do_free)
-		call_rcu(&p->rcu, inetpeer_free_rcu);
-	else
-		/* The node is used again.  Decrease the reference counter
-		 * back.  The loop "cleanup -> unlink_from_unused
-		 *   -> unlink_from_pool -> putpeer -> link_to_unused
-		 *   -> cleanup (for the same node)"
-		 * doesn't really exist because the entry will have a
-		 * recent deletion time and will not be cleaned again soon.
-		 */
-		inet_putpeer(p);
+	peer_avl_rebalance(stack, stackptr, base);
+	base->total--;
+	call_rcu(&p->rcu, inetpeer_free_rcu);
 }
 
 static struct inet_peer_base *family_to_base(int family)
 {
-	return (family == AF_INET ? &v4_peers : &v6_peers);
+	return family == AF_INET ? &v4_peers : &v6_peers;
 }
 
-static struct inet_peer_base *peer_to_base(struct inet_peer *p)
+/* perform garbage collect on all items stacked during a lookup */
+static int inet_peer_gc(struct inet_peer_base *base,
+			struct inet_peer __rcu **stack[PEER_MAXDEPTH],
+			struct inet_peer __rcu ***stackptr)
 {
-	return family_to_base(p->daddr.family);
-}
-
-/* May be called with local BH enabled. */
-static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH])
-{
-	struct inet_peer *p = NULL;
-
-	/* Remove the first entry from the list of unused nodes. */
-	spin_lock_bh(&unused_peers.lock);
-	if (!list_empty(&unused_peers.list)) {
-		__u32 delta;
+	struct inet_peer *p, *gchead = NULL;
+	__u32 delta, ttl;
+	int cnt = 0;
 
-		p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
+	if (base->total >= inet_peer_threshold)
+		ttl = 0; /* be aggressive */
+	else
+		ttl = inet_peer_maxttl
+				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
+					base->total / inet_peer_threshold * HZ;
+	stackptr--; /* last stack slot is peer_avl_empty */
+	while (stackptr > stack) {
+		stackptr--;
+		p = rcu_deref_locked(**stackptr, base);
 		delta = (__u32)jiffies - p->dtime;
-
-		if (delta < ttl) {
-			/* Do not prune fresh entries. */
-			spin_unlock_bh(&unused_peers.lock);
-			return -1;
+		if (atomic_read(&p->refcnt) == 0 && delta >= ttl &&
+		    atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
+			p->gc_next = gchead;
+			gchead = p;
 		}
-
-		list_del_init(&p->unused);
-
-		/* Grab an extra reference to prevent node disappearing
-		 * before unlink_from_pool() call. */
-		atomic_inc(&p->refcnt);
 	}
-	spin_unlock_bh(&unused_peers.lock);
-
-	if (p == NULL)
-		/* It means that the total number of USED entries has
-		 * grown over inet_peer_threshold.  It shouldn't really
-		 * happen because of entry limits in route cache. */
-		return -1;
-
-	unlink_from_pool(p, peer_to_base(p), stack);
-	return 0;
+	while ((p = gchead) != NULL) {
+		gchead = p->gc_next;
+		cnt++;
+		unlink_from_pool(p, base, stack);
+	}
+	return cnt;
 }
 
-/* Called with or without local BH being disabled. */
 struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
 {
 	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
 	struct inet_peer_base *base = family_to_base(daddr->family);
 	struct inet_peer *p;
 	unsigned int sequence;
-	int invalidated, newrefcnt = 0;
+	int invalidated, gccnt = 0;
 
-	/* Look up for the address quickly, lockless.
+	/* Attempt a lockless lookup first.
 	 * Because of a concurrent writer, we might not find an existing entry.
 	 */
 	rcu_read_lock();
 	sequence = read_seqbegin(&base->lock);
-	p = lookup_rcu(daddr, base, &newrefcnt);
+	p = lookup_rcu(daddr, base);
 	invalidated = read_seqretry(&base->lock, sequence);
 	rcu_read_unlock();
 
-	if (p) {
-found:		/* The existing node has been found.
-		 * Remove the entry from unused list if it was there.
-		 */
-		if (newrefcnt == 1)
-			unlink_from_unused(p);
+	if (p)
 		return p;
-	}
 
 	/* If no writer did a change during our lookup, we can return early. */
 	if (!create && !invalidated)
@@ -506,11 +416,17 @@ found: /* The existing node has been found.
 	 * At least, nodes should be hot in our cache.
 	 */
 	write_seqlock_bh(&base->lock);
+relookup:
 	p = lookup(daddr, stack, base);
 	if (p != peer_avl_empty) {
-		newrefcnt = atomic_inc_return(&p->refcnt);
+		atomic_inc(&p->refcnt);
 		write_sequnlock_bh(&base->lock);
-		goto found;
+		return p;
+	}
+	if (!gccnt) {
+		gccnt = inet_peer_gc(base, stack, stackptr);
+		if (gccnt && create)
+			goto relookup;
 	}
 	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
 	if (p) {
@@ -525,7 +441,6 @@ found: /* The existing node has been found.
 		p->pmtu_expires = 0;
 		p->pmtu_orig = 0;
 		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
-		INIT_LIST_HEAD(&p->unused);
 
 
 		/* Link the node. */
@@ -534,63 +449,14 @@ found: /* The existing node has been found.
 	}
 	write_sequnlock_bh(&base->lock);
 
-	if (base->total >= inet_peer_threshold)
-		/* Remove one less-recently-used entry. */
-		cleanup_once(0, stack);
-
 	return p;
 }
-
-static int compute_total(void)
-{
-	return v4_peers.total + v6_peers.total;
-}
 EXPORT_SYMBOL_GPL(inet_getpeer);
 
-/* Called with local BH disabled. */
-static void peer_check_expire(unsigned long dummy)
-{
-	unsigned long now = jiffies;
-	int ttl, total;
-	struct inet_peer __rcu **stack[PEER_MAXDEPTH];
-
-	total = compute_total();
-	if (total >= inet_peer_threshold)
-		ttl = inet_peer_minttl;
-	else
-		ttl = inet_peer_maxttl
-				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
-					total / inet_peer_threshold * HZ;
-	while (!cleanup_once(ttl, stack)) {
-		if (jiffies != now)
-			break;
-	}
-
-	/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
-	 * interval depending on the total number of entries (more entries,
-	 * less interval). */
-	total = compute_total();
-	if (total >= inet_peer_threshold)
-		peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
-	else
-		peer_periodic_timer.expires = jiffies
-			+ inet_peer_gc_maxtime
-			- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
-				total / inet_peer_threshold * HZ;
-	add_timer(&peer_periodic_timer);
-}
-
 void inet_putpeer(struct inet_peer *p)
 {
-	local_bh_disable();
-
-	if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
-		list_add_tail(&p->unused, &unused_peers.list);
-		p->dtime = (__u32)jiffies;
-		spin_unlock(&unused_peers.lock);
-	}
-
-	local_bh_enable();
+	p->dtime = (__u32)jiffies;
+	atomic_dec(&p->refcnt);
 }
 EXPORT_SYMBOL_GPL(inet_putpeer);
 
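
Taken together, the inetpeer.c changes replace the unused list and the periodic timer with a refcount tombstone: inet_peer_gc() claims an idle node by flipping refcnt from 0 to -1 with cmpxchg, while the lockless reader in lookup_rcu() takes references with atomic_add_unless(&refcnt, 1, -1), which refuses once the tombstone is set. A compact userspace model of that handshake, written with C11 atomics rather than the kernel's atomic_t (the dtime/ttl aging check is left out, and the names and single-threaded driver are illustrative only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct peer {
	atomic_int refcnt;	/* 0 = idle, >0 = in use, -1 = being deleted */
};

/* Reader side: mirrors atomic_add_unless(&p->refcnt, 1, -1) in lookup_rcu() --
 * take a reference unless the node is already tombstoned.
 */
static bool peer_try_get(struct peer *p)
{
	int old = atomic_load(&p->refcnt);

	while (old != -1) {
		if (atomic_compare_exchange_weak(&p->refcnt, &old, old + 1))
			return true;
		/* old was reloaded by the failed CAS; retry */
	}
	return false;
}

/* Collector side: mirrors atomic_cmpxchg(&p->refcnt, 0, -1) in inet_peer_gc() --
 * only an idle node (refcnt == 0) can be claimed for deletion.
 */
static bool peer_try_reclaim(struct peer *p)
{
	int expected = 0;

	return atomic_compare_exchange_strong(&p->refcnt, &expected, -1);
}

int main(void)
{
	struct peer p = { .refcnt = 0 };

	printf("reader got ref: %d\n", peer_try_get(&p));     /* 1: refcnt 0 -> 1  */
	printf("gc reclaimed:   %d\n", peer_try_reclaim(&p)); /* 0: still in use   */
	atomic_fetch_sub(&p.refcnt, 1);                       /* like inet_putpeer() */
	printf("gc reclaimed:   %d\n", peer_try_reclaim(&p)); /* 1: refcnt 0 -> -1 */
	printf("reader got ref: %d\n", peer_try_get(&p));     /* 0: tombstoned     */
	return 0;
}

The driver only shows the state transitions; in the kernel the reader side runs under rcu_read_lock() and the collector runs with the pool's seqlock held, as the surrounding hunks show.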
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 57d0752e239a..69fd7201129a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -398,20 +398,6 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_jiffies,
 	},
 	{
-		.procname	= "inet_peer_gc_mintime",
-		.data		= &inet_peer_gc_mintime,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	{
-		.procname	= "inet_peer_gc_maxtime",
-		.data		= &inet_peer_gc_maxtime,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	{
 		.procname	= "tcp_orphan_retries",
 		.data		= &sysctl_tcp_orphan_retries,
 		.maxlen		= sizeof(int),