Diffstat (limited to 'net/ipv4/inetpeer.c')

 net/ipv4/inetpeer.c | 161 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 102 insertions(+), 59 deletions(-)
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index a96e65674ac3..9df4e635fb5f 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -81,19 +81,19 @@ static const struct inet_peer peer_fake_node = {
 
 struct inet_peer_base {
 	struct inet_peer __rcu	*root;
-	spinlock_t		lock;
+	seqlock_t		lock;
 	int			total;
 };
 
 static struct inet_peer_base v4_peers = {
 	.root		= peer_avl_empty_rcu,
-	.lock		= __SPIN_LOCK_UNLOCKED(v4_peers.lock),
+	.lock		= __SEQLOCK_UNLOCKED(v4_peers.lock),
 	.total		= 0,
 };
 
 static struct inet_peer_base v6_peers = {
 	.root		= peer_avl_empty_rcu,
-	.lock		= __SPIN_LOCK_UNLOCKED(v6_peers.lock),
+	.lock		= __SEQLOCK_UNLOCKED(v6_peers.lock),
 	.total		= 0,
 };
 
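The type change above is the heart of this patch: a seqlock_t lets a lockless reader detect whether any writer ran while it walked the tree, instead of taking the lock itself. A minimal sketch of the read/retry pattern, using only the <linux/seqlock.h> API; the demo_* names are illustrative, not from inetpeer.c:

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(demo_lock);
static int demo_value;

/* Reader: never blocks; retries if a writer overlapped the read. */
static int demo_read(void)
{
	unsigned int seq;
	int v;

	do {
		seq = read_seqbegin(&demo_lock);	/* snapshot sequence count */
		v = demo_value;				/* unlocked read */
	} while (read_seqretry(&demo_lock, seq));	/* true if a writer ran */
	return v;
}

/* Writer: serializes against other writers on the embedded spinlock. */
static void demo_write(int v)
{
	write_seqlock_bh(&demo_lock);
	demo_value = v;
	write_sequnlock_bh(&demo_lock);
}

inet_getpeer() below uses a variant of this: rather than looping, it records read_seqretry()'s verdict and only falls back to the write lock when the lookup missed and a writer had interfered.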
@@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a,
 	int i, n = (a->family == AF_INET ? 1 : 4);
 
 	for (i = 0; i < n; i++) {
-		if (a->a6[i] == b->a6[i])
+		if (a->addr.a6[i] == b->addr.a6[i])
 			continue;
-		if (a->a6[i] < b->a6[i])
+		if (a->addr.a6[i] < b->addr.a6[i])
 			return -1;
 		return 1;
 	}
@@ -177,6 +177,9 @@ static int addr_compare(const struct inetpeer_addr *a,
 	return 0;
 }
 
+#define rcu_deref_locked(X, BASE)				\
+	rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
+
 /*
  * Called with local BH disabled and the pool lock held.
  */
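A note on the new helper: rcu_dereference_protected() is the update-side flavour of rcu_dereference(), and the ".lock.lock" reaches through the seqlock_t to the spinlock_t it embeds, which is what lockdep can actually track. Roughly, a use of it expands as follows (illustrative, not part of the patch):

/*	u = rcu_deref_locked(base->root, base);
 * expands to:
 *	u = rcu_dereference_protected(base->root,
 *				      lockdep_is_held(&base->lock.lock));
 *
 * Unlike rcu_dereference(), the _protected form performs no read-side
 * barrier; under CONFIG_PROVE_RCU it instead asserts that the given
 * update-side lock is held, so writers keep their lockdep coverage
 * after the spinlock_t -> seqlock_t switch.
 */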
@@ -187,8 +190,7 @@ static int addr_compare(const struct inetpeer_addr *a,
 									\
 	stackptr = _stack;						\
 	*stackptr++ = &_base->root;					\
-	for (u = rcu_dereference_protected(_base->root,			\
-			lockdep_is_held(&_base->lock));			\
+	for (u = rcu_deref_locked(_base->root, _base);			\
 	     u != peer_avl_empty; ) {					\
 		int cmp = addr_compare(_daddr, &u->daddr);		\
 		if (cmp == 0)						\
@@ -198,23 +200,22 @@ static int addr_compare(const struct inetpeer_addr *a,
 		else							\
 			v = &u->avl_right;				\
 		*stackptr++ = v;					\
-		u = rcu_dereference_protected(*v,			\
-			lockdep_is_held(&_base->lock));			\
+		u = rcu_deref_locked(*v, _base);			\
 	}								\
 	u;								\
 })
 
 /*
- * Called with rcu_read_lock_bh()
+ * Called with rcu_read_lock()
  * Because we hold no lock against a writer, its quite possible we fall
  * in an endless loop.
  * But every pointer we follow is guaranteed to be valid thanks to RCU.
  * We exit from this function if number of links exceeds PEER_MAXDEPTH
  */
-static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
-				       struct inet_peer_base *base)
+static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
+				    struct inet_peer_base *base)
 {
-	struct inet_peer *u = rcu_dereference_bh(base->root);
+	struct inet_peer *u = rcu_dereference(base->root);
 	int count = 0;
 
 	while (u != peer_avl_empty) {
@@ -230,9 +231,9 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
 			return u;
 		}
 		if (cmp == -1)
-			u = rcu_dereference_bh(u->avl_left);
+			u = rcu_dereference(u->avl_left);
 		else
-			u = rcu_dereference_bh(u->avl_right);
+			u = rcu_dereference(u->avl_right);
 		if (unlikely(++count == PEER_MAXDEPTH))
 			break;
 	}
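The comment above the function is worth dwelling on: since the reader holds no lock, a concurrent rebalance can route it in circles, so the walk is capped at PEER_MAXDEPTH and anything deeper is treated as a miss that falls back to the locked lookup. The generic shape of that pattern, with hypothetical node/match/child_for names:

/* Hypothetical sketch of a bounded lockless tree walk. */
struct node *lockless_lookup(struct node __rcu *root, u32 key)
{
	struct node *n = rcu_dereference(root);
	int count = 0;

	while (n) {
		if (match(n, key))
			return n;		/* caller must still take a ref */
		n = rcu_dereference(child_for(n, key));
		if (unlikely(++count == MAX_DEPTH))
			break;			/* writers may have looped us */
	}
	return NULL;				/* treat as a miss */
}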
@@ -246,13 +247,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr,
 	struct inet_peer __rcu **v;					\
 	*stackptr++ = &start->avl_left;					\
 	v = &start->avl_left;						\
-	for (u = rcu_dereference_protected(*v,				\
-			lockdep_is_held(&base->lock));			\
+	for (u = rcu_deref_locked(*v, base);				\
 	     u->avl_right != peer_avl_empty_rcu; ) {			\
 		v = &u->avl_right;					\
 		*stackptr++ = v;					\
-		u = rcu_dereference_protected(*v,			\
-			lockdep_is_held(&base->lock));			\
+		u = rcu_deref_locked(*v, base);				\
 	}								\
 	u;								\
 })
@@ -271,21 +270,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
 
 	while (stackend > stack) {
 		nodep = *--stackend;
-		node = rcu_dereference_protected(*nodep,
-				lockdep_is_held(&base->lock));
-		l = rcu_dereference_protected(node->avl_left,
-				lockdep_is_held(&base->lock));
-		r = rcu_dereference_protected(node->avl_right,
-				lockdep_is_held(&base->lock));
+		node = rcu_deref_locked(*nodep, base);
+		l = rcu_deref_locked(node->avl_left, base);
+		r = rcu_deref_locked(node->avl_right, base);
 		lh = node_height(l);
 		rh = node_height(r);
 		if (lh > rh + 1) { /* l: RH+2 */
 			struct inet_peer *ll, *lr, *lrl, *lrr;
 			int lrh;
-			ll = rcu_dereference_protected(l->avl_left,
-					lockdep_is_held(&base->lock));
-			lr = rcu_dereference_protected(l->avl_right,
-					lockdep_is_held(&base->lock));
+			ll = rcu_deref_locked(l->avl_left, base);
+			lr = rcu_deref_locked(l->avl_right, base);
 			lrh = node_height(lr);
 			if (lrh <= node_height(ll)) {	/* ll: RH+1 */
 				RCU_INIT_POINTER(node->avl_left, lr);	/* lr: RH or RH+1 */
@@ -296,10 +290,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
 				l->avl_height = node->avl_height + 1;
 				RCU_INIT_POINTER(*nodep, l);
 			} else { /* ll: RH, lr: RH+1 */
-				lrl = rcu_dereference_protected(lr->avl_left,
-						lockdep_is_held(&base->lock));	/* lrl: RH or RH-1 */
-				lrr = rcu_dereference_protected(lr->avl_right,
-						lockdep_is_held(&base->lock));	/* lrr: RH or RH-1 */
+				lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
+				lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
 				RCU_INIT_POINTER(node->avl_left, lrr);	/* lrr: RH or RH-1 */
 				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
 				node->avl_height = rh + 1; /* node: RH+1 */
@@ -314,10 +306,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
 		} else if (rh > lh + 1) { /* r: LH+2 */
 			struct inet_peer *rr, *rl, *rlr, *rll;
 			int rlh;
-			rr = rcu_dereference_protected(r->avl_right,
-					lockdep_is_held(&base->lock));
-			rl = rcu_dereference_protected(r->avl_left,
-					lockdep_is_held(&base->lock));
+			rr = rcu_deref_locked(r->avl_right, base);
+			rl = rcu_deref_locked(r->avl_left, base);
 			rlh = node_height(rl);
 			if (rlh <= node_height(rr)) {	/* rr: LH+1 */
 				RCU_INIT_POINTER(node->avl_right, rl);	/* rl: LH or LH+1 */
@@ -328,10 +318,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
 				r->avl_height = node->avl_height + 1;
 				RCU_INIT_POINTER(*nodep, r);
 			} else { /* rr: RH, rl: RH+1 */
-				rlr = rcu_dereference_protected(rl->avl_right,
-						lockdep_is_held(&base->lock));	/* rlr: LH or LH-1 */
-				rll = rcu_dereference_protected(rl->avl_left,
-						lockdep_is_held(&base->lock));	/* rll: LH or LH-1 */
+				rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
+				rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
 				RCU_INIT_POINTER(node->avl_right, rll);	/* rll: LH or LH-1 */
 				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
 				node->avl_height = lh + 1; /* node: LH+1 */
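All four branches above are the textbook AVL rotations, only expressed with RCU-safe pointer publication. For reference, the left-heavy double rotation (the second case, ll: RH and lr: RH+1) reshapes the tree like this, using the patch's own names and height annotations:

/*
 *        node (RH+3)                        lr (RH+2)
 *        /       \                         /        \
 *     l (RH+2)  r (RH)      ==>       l (RH+1)    node (RH+1)
 *     /    \                          /    \       /    \
 *  ll (RH)  lr (RH+1)             ll (RH)  lrl   lrr   r (RH)
 *           /   \
 *         lrl   lrr        (lrl, lrr: each RH or RH-1)
 *
 * Every child pointer is published with RCU_INIT_POINTER(), so a
 * concurrent lockless reader always sees a valid, if momentarily
 * stale, subtree.
 */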
@@ -366,13 +354,14 @@ static void inetpeer_free_rcu(struct rcu_head *head)
 }
 
 /* May be called with local BH enabled. */
-static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
+static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
+			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])
 {
 	int do_free;
 
 	do_free = 0;
 
-	spin_lock_bh(&base->lock);
+	write_seqlock_bh(&base->lock);
 	/* Check the reference counter.  It was artificially incremented by 1
 	 * in cleanup() function to prevent sudden disappearing.  If we can
 	 * atomically (because of lockless readers) take this last reference,
@@ -380,7 +369,6 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
 	 * We use refcnt=-1 to alert lockless readers this entry is deleted.
 	 */
 	if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
-		struct inet_peer __rcu **stack[PEER_MAXDEPTH];
 		struct inet_peer __rcu ***stackptr, ***delp;
 		if (lookup(&p->daddr, stack, base) != p)
 			BUG();
@@ -392,8 +380,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
 			/* look for a node to insert instead of p */
 			struct inet_peer *t;
 			t = lookup_rightempty(p, base);
-			BUG_ON(rcu_dereference_protected(*stackptr[-1],
-					lockdep_is_held(&base->lock)) != t);
+			BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
 			**--stackptr = t->avl_left;
 			/* t is removed, t->daddr > x->daddr for any
 			 * x in p->avl_left subtree.
@@ -409,10 +396,10 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base)
 		base->total--;
 		do_free = 1;
 	}
-	spin_unlock_bh(&base->lock);
+	write_sequnlock_bh(&base->lock);
 
 	if (do_free)
-		call_rcu_bh(&p->rcu, inetpeer_free_rcu);
+		call_rcu(&p->rcu, inetpeer_free_rcu);
 	else
 		/* The node is used again.  Decrease the reference counter
 		 * back.  The loop "cleanup -> unlink_from_unused
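The cmpxchg above implements a small handshake with the lockless readers: cleanup_once() pre-biases refcnt by +1, so swapping 1 -> -1 succeeds only when nobody else holds a reference, and -1 then acts as a tombstone. Sketched as helpers (the names are mine; the patch open-codes both sides):

/* Writer side, called under write_seqlock_bh(&base->lock): succeeds
 * only when the cleanup bias (+1) is the sole remaining reference.
 */
static bool try_mark_deleted(struct inet_peer *p)
{
	return atomic_cmpxchg(&p->refcnt, 1, -1) == 1;	/* -1 == deleted */
}

/* Lockless reader side: take a reference unless the node is already
 * marked deleted (refcnt == -1), matching the atomic_add_unless()
 * check the lockless lookup performs before returning a node.
 */
static bool try_get_ref(struct inet_peer *p)
{
	return atomic_add_unless(&p->refcnt, 1, -1);
}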
@@ -435,7 +422,7 @@ static struct inet_peer_base *peer_to_base(struct inet_peer *p)
 }
 
 /* May be called with local BH enabled. */
-static int cleanup_once(unsigned long ttl)
+static int cleanup_once(unsigned long ttl, struct inet_peer __rcu **stack[PEER_MAXDEPTH])
 {
 	struct inet_peer *p = NULL;
 
@@ -467,7 +454,7 @@ static int cleanup_once(unsigned long ttl)
 		 * happen because of entry limits in route cache. */
 		return -1;
 
-	unlink_from_pool(p, peer_to_base(p));
+	unlink_from_pool(p, peer_to_base(p), stack);
 	return 0;
 }
 
@@ -477,13 +464,17 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
 	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
 	struct inet_peer_base *base = family_to_base(daddr->family);
 	struct inet_peer *p;
+	unsigned int sequence;
+	int invalidated;
 
 	/* Look up for the address quickly, lockless.
 	 * Because of a concurrent writer, we might not find an existing entry.
 	 */
-	rcu_read_lock_bh();
-	p = lookup_rcu_bh(daddr, base);
-	rcu_read_unlock_bh();
+	rcu_read_lock();
+	sequence = read_seqbegin(&base->lock);
+	p = lookup_rcu(daddr, base);
+	invalidated = read_seqretry(&base->lock, sequence);
+	rcu_read_unlock();
 
 	if (p) {
 		/* The existing node has been found.
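This is where the seqlock pays off. The reader samples the sequence count before the walk and asks afterwards whether a writer overlapped it; a hit needs no lock at all, and a miss is trustworthy whenever `invalidated` is false. Condensed control flow of the function (a paraphrase, not a literal excerpt):

/*	rcu_read_lock();
 *	seq = read_seqbegin(&base->lock);
 *	p = lookup_rcu(daddr, base);		// may miss spuriously
 *	invalidated = read_seqretry(&base->lock, seq);
 *	rcu_read_unlock();
 *
 *	if (p)
 *		return p;			// hit: no lock ever taken
 *	if (!create && !invalidated)
 *		return NULL;			// clean miss: tree unchanged
 *	write_seqlock_bh(&base->lock);		// slow path: retry locked
 *	...
 */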
@@ -493,14 +484,18 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
 		return p;
 	}
 
+	/* If no writer did a change during our lookup, we can return early. */
+	if (!create && !invalidated)
+		return NULL;
+
 	/* retry an exact lookup, taking the lock before.
 	 * At least, nodes should be hot in our cache.
 	 */
-	spin_lock_bh(&base->lock);
+	write_seqlock_bh(&base->lock);
 	p = lookup(daddr, stack, base);
 	if (p != peer_avl_empty) {
 		atomic_inc(&p->refcnt);
-		spin_unlock_bh(&base->lock);
+		write_sequnlock_bh(&base->lock);
 		/* Remove the entry from unused list if it was there. */
 		unlink_from_unused(p);
 		return p;
@@ -510,8 +505,14 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
 	p->daddr = *daddr;
 	atomic_set(&p->refcnt, 1);
 	atomic_set(&p->rid, 0);
-	atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4));
+	atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
 	p->tcp_ts_stamp = 0;
+	p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+	p->rate_tokens = 0;
+	p->rate_last = 0;
+	p->pmtu_expires = 0;
+	p->pmtu_orig = 0;
+	memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
 	INIT_LIST_HEAD(&p->unused);
 
 
@@ -519,11 +520,11 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
 		link_to_pool(p, base);
 		base->total++;
 	}
-	spin_unlock_bh(&base->lock);
+	write_sequnlock_bh(&base->lock);
 
 	if (base->total >= inet_peer_threshold)
 		/* Remove one less-recently-used entry. */
-		cleanup_once(0);
+		cleanup_once(0, stack);
 
 	return p;
 }
@@ -539,6 +540,7 @@ static void peer_check_expire(unsigned long dummy)
 {
 	unsigned long now = jiffies;
 	int ttl, total;
+	struct inet_peer __rcu **stack[PEER_MAXDEPTH];
 
 	total = compute_total();
 	if (total >= inet_peer_threshold)
@@ -547,7 +549,7 @@ static void peer_check_expire(unsigned long dummy)
 		ttl = inet_peer_maxttl
 				- (inet_peer_maxttl -  inet_peer_minttl) / HZ *
 					total / inet_peer_threshold * HZ;
-	while (!cleanup_once(ttl)) {
+	while (!cleanup_once(ttl, stack)) {
 		if (jiffies != now)
 			break;
 	}
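The expression above linearly interpolates the per-entry TTL between the two sysctls as the pool fills. A worked example, assuming the defaults of this era (inet_peer_minttl = 120 * HZ, inet_peer_maxttl = 600 * HZ) and a pool at half of inet_peer_threshold:

/*	ttl = 600*HZ - (600*HZ - 120*HZ) / HZ * total / threshold * HZ
 *	    = 600*HZ - 480 * (threshold / 2) / threshold * HZ
 *	    = 600*HZ - 240*HZ
 *	    = 360*HZ	(i.e. 360 seconds)
 *
 * An emptier pool keeps entries longer (up to 10 minutes); a pool at
 * the threshold expires them after as little as 2 minutes.
 */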
@@ -579,3 +581,44 @@ void inet_putpeer(struct inet_peer *p)
 	local_bh_enable();
 }
 EXPORT_SYMBOL_GPL(inet_putpeer);
+
+/*
+ *	Check transmit rate limitation for given message.
+ *	The rate information is held in the inet_peer entries now.
+ *	This function is generic and could be used for other purposes
+ *	too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ *	Note that the same inet_peer fields are modified by functions in
+ *	route.c too, but these work for packet destinations while xrlim_allow
+ *	works for icmp destinations. This means the rate limiting information
+ *	for one "ip object" is shared - and these ICMPs are twice limited:
+ *	by source and by destination.
+ *
+ *	RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ *			  SHOULD allow setting of rate limits
+ *
+ *	Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+	unsigned long now, token;
+	bool rc = false;
+
+	if (!peer)
+		return true;
+
+	token = peer->rate_tokens;
+	now = jiffies;
+	token += now - peer->rate_last;
+	peer->rate_last = now;
+	if (token > XRLIM_BURST_FACTOR * timeout)
+		token = XRLIM_BURST_FACTOR * timeout;
+	if (token >= timeout) {
+		token -= timeout;
+		rc = true;
+	}
+	peer->rate_tokens = token;
+	return rc;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
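Finally, a hedged sketch of how a caller might use the new export; the helper name and lookup flow are illustrative, not quoted from the patch. Tokens accrue one per jiffy up to a burst of six timeouts, and each allowed message spends `timeout` tokens, so the steady-state rate is one message per timeout interval:

/* Illustrative caller: gate an ICMP error reply on the peer's bucket. */
static bool may_send_icmp_error(struct inetpeer_addr *daddr, int ratelimit)
{
	struct inet_peer *peer = inet_getpeer(daddr, 1);	/* create if absent */
	bool allow = inet_peer_xrlim_allow(peer, ratelimit);	/* NULL => allow */

	if (peer)
		inet_putpeer(peer);
	return allow;
}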