Diffstat (limited to 'net/ipv4/inetpeer.c')
-rw-r--r--	net/ipv4/inetpeer.c	294
1 file changed, 84 insertions, 210 deletions
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index ce616d92cc5..86f13c67ea8 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -19,6 +19,7 @@
 #include <linux/net.h>
 #include <net/ip.h>
 #include <net/inetpeer.h>
+#include <net/secure_seq.h>
 
 /*
  * Theory of operations.
@@ -54,15 +55,11 @@
  *		1.  Nodes may appear in the tree only with the pool lock held.
  *		2.  Nodes may disappear from the tree only with the pool lock held
  *		    AND reference count being 0.
- *		3.  Nodes appears and disappears from unused node list only under
- *		    "inet_peer_unused_lock".
- *		4.  Global variable peer_total is modified under the pool lock.
- *		5.  struct inet_peer fields modification:
+ *		3.  Global variable peer_total is modified under the pool lock.
+ *		4.  struct inet_peer fields modification:
  *		avl_left, avl_right, avl_parent, avl_height: pool lock
- *		unused: unused node list lock
  *		refcnt: atomically against modifications on other CPU;
  *		   usually under some other lock to prevent node disappearing
- *		dtime: unused node list lock
  *		daddr: unchangeable
  *		ip_id_count: atomic value (no lock needed)
  */
@@ -104,19 +101,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries m
 					 * aggressively at this stage */
 int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */
 int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */
-int inet_peer_gc_mintime __read_mostly = 10 * HZ;
-int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
-
-static struct {
-	struct list_head	list;
-	spinlock_t		lock;
-} unused_peers = {
-	.list		= LIST_HEAD_INIT(unused_peers.list),
-	.lock		= __SPIN_LOCK_UNLOCKED(unused_peers.lock),
-};
-
-static void peer_check_expire(unsigned long dummy);
-static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
 
 
 /* Called from ip_output.c:ip_init */
@@ -142,21 +126,6 @@ void __init inet_initpeers(void)
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
 			NULL);
 
-	/* All the timers, started at system startup tend
-	   to synchronize. Perturb it a bit.
-	 */
-	peer_periodic_timer.expires = jiffies
-		+ net_random() % inet_peer_gc_maxtime
-		+ inet_peer_gc_maxtime;
-	add_timer(&peer_periodic_timer);
-}
-
-/* Called with or without local BH being disabled. */
-static void unlink_from_unused(struct inet_peer *p)
-{
-	spin_lock_bh(&unused_peers.lock);
-	list_del_init(&p->unused);
-	spin_unlock_bh(&unused_peers.lock);
 }
 
 static int addr_compare(const struct inetpeer_addr *a,
@@ -203,20 +172,6 @@ static int addr_compare(const struct inetpeer_addr *a,
 	u;						\
 })
 
-static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
-{
-	int cur, old = atomic_read(ptr);
-
-	while (old != u) {
-		*newv = old + a;
-		cur = atomic_cmpxchg(ptr, old, *newv);
-		if (cur == old)
-			return true;
-		old = cur;
-	}
-	return false;
-}
-
 /*
  * Called with rcu_read_lock()
  * Because we hold no lock against a writer, its quite possible we fall
@@ -225,8 +180,7 @@ static bool atomic_add_unless_return(atomic_t *ptr, int a, int u, int *newv)
  * We exit from this function if number of links exceeds PEER_MAXDEPTH
  */
 static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
-				    struct inet_peer_base *base,
-				    int *newrefcnt)
+				    struct inet_peer_base *base)
 {
 	struct inet_peer *u = rcu_dereference(base->root);
 	int count = 0;
@@ -235,11 +189,9 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
 		int cmp = addr_compare(daddr, &u->daddr);
 		if (cmp == 0) {
 			/* Before taking a reference, check if this entry was
-			 * deleted, unlink_from_pool() sets refcnt=-1 to make
-			 * distinction between an unused entry (refcnt=0) and
-			 * a freed one.
+			 * deleted (refcnt=-1)
 			 */
-			if (!atomic_add_unless_return(&u->refcnt, 1, -1, newrefcnt))
+			if (!atomic_add_unless(&u->refcnt, 1, -1))
 				u = NULL;
 			return u;
 		}
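
The lockless lookup above relies on a refcount tombstone: a dying entry is marked with refcnt == -1, so lookup_rcu() only takes a reference via atomic_add_unless(&u->refcnt, 1, -1) while the entry is still live. A minimal user-space sketch of that take-a-reference-unless-deleted pattern, using C11 atomics; the helper name and NODE_DELETED constant are illustrative, not kernel API:

#include <stdatomic.h>
#include <stdbool.h>

#define NODE_DELETED (-1)

static bool ref_get_unless_deleted(atomic_int *refcnt)
{
	int old = atomic_load(refcnt);

	while (old != NODE_DELETED) {
		/* Try to move old -> old + 1; on failure 'old' is reloaded
		 * with the current value and the loop re-checks the tombstone.
		 */
		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
			return true;	/* reference taken, entry is live */
	}
	return false;			/* tombstoned: being freed, caller must skip it */
}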
@@ -366,137 +318,99 @@ static void inetpeer_free_rcu(struct rcu_head *head)
 	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
 }
 
-/* May be called with local BH enabled. */
 static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
 			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])
 {
-	int do_free;
+	struct inet_peer __rcu ***stackptr, ***delp;
 
-	do_free = 0;
-
-	write_seqlock_bh(&base->lock);
-	/* Check the reference counter.  It was artificially incremented by 1
-	 * in cleanup() function to prevent sudden disappearing.  If we can
-	 * atomically (because of lockless readers) take this last reference,
-	 * it's safe to remove the node and free it later.
-	 * We use refcnt=-1 to alert lockless readers this entry is deleted.
-	 */
-	if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
-		struct inet_peer __rcu ***stackptr, ***delp;
-		if (lookup(&p->daddr, stack, base) != p)
-			BUG();
-		delp = stackptr - 1; /* *delp[0] == p */
-		if (p->avl_left == peer_avl_empty_rcu) {
-			*delp[0] = p->avl_right;
-			--stackptr;
-		} else {
-			/* look for a node to insert instead of p */
-			struct inet_peer *t;
-			t = lookup_rightempty(p, base);
-			BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
-			**--stackptr = t->avl_left;
-			/* t is removed, t->daddr > x->daddr for any
-			 * x in p->avl_left subtree.
-			 * Put t in the old place of p. */
-			RCU_INIT_POINTER(*delp[0], t);
-			t->avl_left = p->avl_left;
-			t->avl_right = p->avl_right;
-			t->avl_height = p->avl_height;
-			BUG_ON(delp[1] != &p->avl_left);
-			delp[1] = &t->avl_left; /* was &p->avl_left */
-		}
-		peer_avl_rebalance(stack, stackptr, base);
-		base->total--;
-		do_free = 1;
+	if (lookup(&p->daddr, stack, base) != p)
+		BUG();
+	delp = stackptr - 1; /* *delp[0] == p */
+	if (p->avl_left == peer_avl_empty_rcu) {
+		*delp[0] = p->avl_right;
+		--stackptr;
+	} else {
+		/* look for a node to insert instead of p */
+		struct inet_peer *t;
+		t = lookup_rightempty(p, base);
+		BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
+		**--stackptr = t->avl_left;
+		/* t is removed, t->daddr > x->daddr for any
+		 * x in p->avl_left subtree.
+		 * Put t in the old place of p. */
+		RCU_INIT_POINTER(*delp[0], t);
+		t->avl_left = p->avl_left;
+		t->avl_right = p->avl_right;
+		t->avl_height = p->avl_height;
+		BUG_ON(delp[1] != &p->avl_left);
+		delp[1] = &t->avl_left; /* was &p->avl_left */
 	}
-	write_sequnlock_bh(&base->lock);
-
-	if (do_free)
-		call_rcu(&p->rcu, inetpeer_free_rcu);
-	else
-		/* The node is used again.  Decrease the reference counter
-		 * back.  The loop "cleanup -> unlink_from_unused
-		 *   -> unlink_from_pool -> putpeer -> link_to_unused
-		 *   -> cleanup (for the same node)"
-		 * doesn't really exist because the entry will have a
-		 * recent deletion time and will not be cleaned again soon.
-		 */
-		inet_putpeer(p);
+	peer_avl_rebalance(stack, stackptr, base);
+	base->total--;
+	call_rcu(&p->rcu, inetpeer_free_rcu);
 }
 
 static struct inet_peer_base *family_to_base(int family)
 {
-	return (family == AF_INET ? &v4_peers : &v6_peers);
-}
-
-static struct inet_peer_base *peer_to_base(struct inet_peer *p)
-{
-	return family_to_base(p->daddr.family);
+	return family == AF_INET ? &v4_peers : &v6_peers;
 }
 
-/* May be called with local BH enabled. */
-static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH])
+/* perform garbage collect on all items stacked during a lookup */
+static int inet_peer_gc(struct inet_peer_base *base,
+			struct inet_peer __rcu **stack[PEER_MAXDEPTH],
+			struct inet_peer __rcu ***stackptr)
 {
-	struct inet_peer *p = NULL;
-
-	/* Remove the first entry from the list of unused nodes. */
-	spin_lock_bh(&unused_peers.lock);
-	if (!list_empty(&unused_peers.list)) {
-		__u32 delta;
-
-		p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
-		delta = (__u32)jiffies - p->dtime;
+	struct inet_peer *p, *gchead = NULL;
+	__u32 delta, ttl;
+	int cnt = 0;
 
-		if (delta < ttl) {
-			/* Do not prune fresh entries. */
-			spin_unlock_bh(&unused_peers.lock);
-			return -1;
+	if (base->total >= inet_peer_threshold)
+		ttl = 0; /* be aggressive */
+	else
+		ttl = inet_peer_maxttl
+				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
+					base->total / inet_peer_threshold * HZ;
+	stackptr--; /* last stack slot is peer_avl_empty */
+	while (stackptr > stack) {
+		stackptr--;
+		p = rcu_deref_locked(**stackptr, base);
+		if (atomic_read(&p->refcnt) == 0) {
+			smp_rmb();
+			delta = (__u32)jiffies - p->dtime;
+			if (delta >= ttl &&
+			    atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
+				p->gc_next = gchead;
+				gchead = p;
+			}
 		}
-
-		list_del_init(&p->unused);
-
-		/* Grab an extra reference to prevent node disappearing
-		 * before unlink_from_pool() call. */
-		atomic_inc(&p->refcnt);
 	}
-	spin_unlock_bh(&unused_peers.lock);
-
-	if (p == NULL)
-		/* It means that the total number of USED entries has
-		 * grown over inet_peer_threshold.  It shouldn't really
-		 * happen because of entry limits in route cache. */
-		return -1;
-
-	unlink_from_pool(p, peer_to_base(p), stack);
-	return 0;
+	while ((p = gchead) != NULL) {
+		gchead = p->gc_next;
+		cnt++;
+		unlink_from_pool(p, base, stack);
+	}
+	return cnt;
 }
 
-/* Called with or without local BH being disabled. */
-struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
+struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
 {
 	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
 	struct inet_peer_base *base = family_to_base(daddr->family);
 	struct inet_peer *p;
 	unsigned int sequence;
-	int invalidated, newrefcnt = 0;
+	int invalidated, gccnt = 0;
 
-	/* Look up for the address quickly, lockless.
+	/* Attempt a lockless lookup first.
 	 * Because of a concurrent writer, we might not find an existing entry.
 	 */
 	rcu_read_lock();
 	sequence = read_seqbegin(&base->lock);
-	p = lookup_rcu(daddr, base, &newrefcnt);
+	p = lookup_rcu(daddr, base);
 	invalidated = read_seqretry(&base->lock, sequence);
 	rcu_read_unlock();
 
-	if (p) {
-found:		/* The existing node has been found.
-		 * Remove the entry from unused list if it was there.
-		 */
-		if (newrefcnt == 1)
-			unlink_from_unused(p);
+	if (p)
 		return p;
-	}
 
 	/* If no writer did a change during our lookup, we can return early. */
 	if (!create && !invalidated)
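
The new inet_peer_gc() walks only the nodes stacked on the lookup path and reclaims those unreferenced for longer than a TTL that shrinks as the pool fills: it interpolates linearly from inet_peer_maxttl down toward inet_peer_minttl and collapses to zero once base->total reaches inet_peer_threshold. A stand-alone sketch of that formula, assuming the defaults visible earlier in this patch (maxttl 600 s, minttl 120 s, threshold 65536 + 128); the helper is hypothetical, not kernel code:

/* Hypothetical helper: effective TTL (seconds) below which an
 * unreferenced inet_peer entry is kept.  With the defaults, a
 * half-full pool gives 600 - (600 - 120) * 32832 / 65664 = 360 s.
 */
static unsigned int gc_ttl_seconds(unsigned int total, unsigned int threshold,
				   unsigned int minttl, unsigned int maxttl)
{
	if (total >= threshold)
		return 0;	/* over the threshold: reclaim aggressively */
	return maxttl - (maxttl - minttl) * total / threshold;
}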
@@ -506,18 +420,27 @@ found:	/* The existing node has been found.
 	 * At least, nodes should be hot in our cache.
 	 */
 	write_seqlock_bh(&base->lock);
+relookup:
 	p = lookup(daddr, stack, base);
 	if (p != peer_avl_empty) {
-		newrefcnt = atomic_inc_return(&p->refcnt);
+		atomic_inc(&p->refcnt);
 		write_sequnlock_bh(&base->lock);
-		goto found;
+		return p;
+	}
+	if (!gccnt) {
+		gccnt = inet_peer_gc(base, stack, stackptr);
+		if (gccnt && create)
+			goto relookup;
 	}
 	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
 	if (p) {
 		p->daddr = *daddr;
 		atomic_set(&p->refcnt, 1);
 		atomic_set(&p->rid, 0);
-		atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
+		atomic_set(&p->ip_id_count,
+				(daddr->family == AF_INET) ?
+					secure_ip_id(daddr->addr.a4) :
+					secure_ipv6_id(daddr->addr.a6));
 		p->tcp_ts_stamp = 0;
 		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
 		p->rate_tokens = 0;
@@ -525,7 +448,6 @@ found:	/* The existing node has been found.
 		p->pmtu_expires = 0;
 		p->pmtu_orig = 0;
 		memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
-		INIT_LIST_HEAD(&p->unused);
 
 
 		/* Link the node. */
@@ -534,63 +456,15 @@ found:	/* The existing node has been found.
 	}
 	write_sequnlock_bh(&base->lock);
 
-	if (base->total >= inet_peer_threshold)
-		/* Remove one less-recently-used entry. */
-		cleanup_once(0, stack);
-
 	return p;
 }
-
-static int compute_total(void)
-{
-	return v4_peers.total + v6_peers.total;
-}
 EXPORT_SYMBOL_GPL(inet_getpeer);
 
-/* Called with local BH disabled. */
-static void peer_check_expire(unsigned long dummy)
-{
-	unsigned long now = jiffies;
-	int ttl, total;
-	struct inet_peer __rcu **stack[PEER_MAXDEPTH];
-
-	total = compute_total();
-	if (total >= inet_peer_threshold)
-		ttl = inet_peer_minttl;
-	else
-		ttl = inet_peer_maxttl
-				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
-					total / inet_peer_threshold * HZ;
-	while (!cleanup_once(ttl, stack)) {
-		if (jiffies != now)
-			break;
-	}
-
-	/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
-	 * interval depending on the total number of entries (more entries,
-	 * less interval). */
-	total = compute_total();
-	if (total >= inet_peer_threshold)
-		peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
-	else
-		peer_periodic_timer.expires = jiffies
-			+ inet_peer_gc_maxtime
-			- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
-				total / inet_peer_threshold * HZ;
-	add_timer(&peer_periodic_timer);
-}
-
 void inet_putpeer(struct inet_peer *p)
 {
-	local_bh_disable();
-
-	if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
-		list_add_tail(&p->unused, &unused_peers.list);
-		p->dtime = (__u32)jiffies;
-		spin_unlock(&unused_peers.lock);
-	}
-
-	local_bh_enable();
+	p->dtime = (__u32)jiffies;
+	smp_mb__before_atomic_dec();
+	atomic_dec(&p->refcnt);
 }
 EXPORT_SYMBOL_GPL(inet_putpeer);
 
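
With the unused list gone, inet_putpeer() only records dtime and drops the reference; correctness depends on the dtime store being visible before the refcount reaches zero, which smp_mb__before_atomic_dec() provides on the put side and which inet_peer_gc() pairs with smp_rmb() after reading refcnt == 0. A hedged user-space sketch of that publish/observe pairing; the C11 fences only approximate the kernel barriers (smp_mb is a full barrier), and all names here are illustrative:

#include <stdatomic.h>
#include <stdbool.h>

struct fake_peer {
	atomic_uint dtime;	/* timestamp of the last put */
	atomic_int  refcnt;
};

/* Writer side, mirroring the new inet_putpeer(). */
static void put_peer(struct fake_peer *p, unsigned int now)
{
	atomic_store_explicit(&p->dtime, now, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_mb__before_atomic_dec() */
	atomic_fetch_sub_explicit(&p->refcnt, 1, memory_order_relaxed);
}

/* Reclaimer side, mirroring the refcnt/dtime checks in inet_peer_gc(). */
static bool looks_expired(struct fake_peer *p, unsigned int now, unsigned int ttl)
{
	if (atomic_load_explicit(&p->refcnt, memory_order_relaxed) != 0)
		return false;				/* still referenced */
	atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
	return now - atomic_load_explicit(&p->dtime, memory_order_relaxed) >= ttl;
}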