diff options
author | Eric Dumazet <edumazet@google.com> | 2014-06-02 08:26:03 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-06-02 14:00:41 -0400 |
commit | 73f156a6e8c1074ac6327e0abd1169e95eb66463 (patch) | |
tree | 2c8b222f21784e738c397ba95dee70a8f256ea64 /net/ipv4 | |
parent | e067ee336a9d3f038ffa9699c59f2abec3376bf7 (diff) |
inetpeer: get rid of ip_id_count
Ideally, we would need to generate IP ID using a per destination IP
generator.
Linux kernels used the inet_peer cache for this purpose, but this had a huge
cost on servers disabling MTU discovery.
1) each inet_peer struct consumes 192 bytes
2) inetpeer cache uses a binary tree of inet_peer structs,
with a nominal size of ~66000 elements under load.
3) lookups in this tree are hitting a lot of cache lines, as tree depth
is about 20.
4) If the server deals with many TCP flows, we have a high probability of
not finding the inet_peer, allocating a fresh one, and inserting it in
the tree with the same initial ip_id_count (cf. secure_ip_id())
5) We garbage collect inet_peer aggressively.
IP ID generation does not have to be 'perfect'.
Goal is trying to avoid duplicates in a short period of time,
so that reassembly units have a chance to complete reassembly of
fragments belonging to one message before receiving other fragments
with a recycled ID.
We simply use an array of generators, and a Jenkins hash using the dst IP
as a key.
ipv6_select_ident() is put back into net/ipv6/ip6_output.c where it
belongs (it is only used from this file)
secure_ip_id() and secure_ipv6_id() are no longer needed.
Rename ip_select_ident_more() to ip_select_ident_segs() to avoid
unnecessary decrement/increment of the number of segments.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/igmp.c | 4 | ||||
-rw-r--r-- | net/ipv4/inetpeer.c | 18 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 7 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel_core.c | 2 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 2 | ||||
-rw-r--r-- | net/ipv4/raw.c | 2 | ||||
-rw-r--r-- | net/ipv4/route.c | 45 | ||||
-rw-r--r-- | net/ipv4/xfrm4_mode_tunnel.c | 2 |
8 files changed, 25 insertions, 57 deletions
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 17d34e3c2ac3..6748d420f714 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
@@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) | |||
369 | pip->saddr = fl4.saddr; | 369 | pip->saddr = fl4.saddr; |
370 | pip->protocol = IPPROTO_IGMP; | 370 | pip->protocol = IPPROTO_IGMP; |
371 | pip->tot_len = 0; /* filled in later */ | 371 | pip->tot_len = 0; /* filled in later */ |
372 | ip_select_ident(skb, &rt->dst, NULL); | 372 | ip_select_ident(skb, NULL); |
373 | ((u8 *)&pip[1])[0] = IPOPT_RA; | 373 | ((u8 *)&pip[1])[0] = IPOPT_RA; |
374 | ((u8 *)&pip[1])[1] = 4; | 374 | ((u8 *)&pip[1])[1] = 4; |
375 | ((u8 *)&pip[1])[2] = 0; | 375 | ((u8 *)&pip[1])[2] = 0; |
@@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, | |||
714 | iph->daddr = dst; | 714 | iph->daddr = dst; |
715 | iph->saddr = fl4.saddr; | 715 | iph->saddr = fl4.saddr; |
716 | iph->protocol = IPPROTO_IGMP; | 716 | iph->protocol = IPPROTO_IGMP; |
717 | ip_select_ident(skb, &rt->dst, NULL); | 717 | ip_select_ident(skb, NULL); |
718 | ((u8 *)&iph[1])[0] = IPOPT_RA; | 718 | ((u8 *)&iph[1])[0] = IPOPT_RA; |
719 | ((u8 *)&iph[1])[1] = 4; | 719 | ((u8 *)&iph[1])[1] = 4; |
720 | ((u8 *)&iph[1])[2] = 0; | 720 | ((u8 *)&iph[1])[2] = 0; |
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index c98cf141f4ed..4ced1b9a97f0 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c | |||
@@ -26,20 +26,7 @@ | |||
26 | * Theory of operations. | 26 | * Theory of operations. |
27 | * We keep one entry for each peer IP address. The nodes contains long-living | 27 | * We keep one entry for each peer IP address. The nodes contains long-living |
28 | * information about the peer which doesn't depend on routes. | 28 | * information about the peer which doesn't depend on routes. |
29 | * At this moment this information consists only of ID field for the next | ||
30 | * outgoing IP packet. This field is incremented with each packet as encoded | ||
31 | * in inet_getid() function (include/net/inetpeer.h). | ||
32 | * At the moment of writing this notes identifier of IP packets is generated | ||
33 | * to be unpredictable using this code only for packets subjected | ||
34 | * (actually or potentially) to defragmentation. I.e. DF packets less than | ||
35 | * PMTU in size when local fragmentation is disabled use a constant ID and do | ||
36 | * not use this code (see ip_select_ident() in include/net/ip.h). | ||
37 | * | 29 | * |
38 | * Route cache entries hold references to our nodes. | ||
39 | * New cache entries get references via lookup by destination IP address in | ||
40 | * the avl tree. The reference is grabbed only when it's needed i.e. only | ||
41 | * when we try to output IP packet which needs an unpredictable ID (see | ||
42 | * __ip_select_ident() in net/ipv4/route.c). | ||
43 | * Nodes are removed only when reference counter goes to 0. | 30 | * Nodes are removed only when reference counter goes to 0. |
44 | * When it's happened the node may be removed when a sufficient amount of | 31 | * When it's happened the node may be removed when a sufficient amount of |
45 | * time has been passed since its last use. The less-recently-used entry can | 32 | * time has been passed since its last use. The less-recently-used entry can |
@@ -62,7 +49,6 @@ | |||
62 | * refcnt: atomically against modifications on other CPU; | 49 | * refcnt: atomically against modifications on other CPU; |
63 | * usually under some other lock to prevent node disappearing | 50 | * usually under some other lock to prevent node disappearing |
64 | * daddr: unchangeable | 51 | * daddr: unchangeable |
65 | * ip_id_count: atomic value (no lock needed) | ||
66 | */ | 52 | */ |
67 | 53 | ||
68 | static struct kmem_cache *peer_cachep __read_mostly; | 54 | static struct kmem_cache *peer_cachep __read_mostly; |
@@ -497,10 +483,6 @@ relookup: | |||
497 | p->daddr = *daddr; | 483 | p->daddr = *daddr; |
498 | atomic_set(&p->refcnt, 1); | 484 | atomic_set(&p->refcnt, 1); |
499 | atomic_set(&p->rid, 0); | 485 | atomic_set(&p->rid, 0); |
500 | atomic_set(&p->ip_id_count, | ||
501 | (daddr->family == AF_INET) ? | ||
502 | secure_ip_id(daddr->addr.a4) : | ||
503 | secure_ipv6_id(daddr->addr.a6)); | ||
504 | p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; | 486 | p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; |
505 | p->rate_tokens = 0; | 487 | p->rate_tokens = 0; |
506 | /* 60*HZ is arbitrary, but chosen enough high so that the first | 488 | /* 60*HZ is arbitrary, but chosen enough high so that the first |
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e231ab58d65..8d3b6b0e9857 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, | |||
148 | iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); | 148 | iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); |
149 | iph->saddr = saddr; | 149 | iph->saddr = saddr; |
150 | iph->protocol = sk->sk_protocol; | 150 | iph->protocol = sk->sk_protocol; |
151 | ip_select_ident(skb, &rt->dst, sk); | 151 | ip_select_ident(skb, sk); |
152 | 152 | ||
153 | if (opt && opt->opt.optlen) { | 153 | if (opt && opt->opt.optlen) { |
154 | iph->ihl += opt->opt.optlen>>2; | 154 | iph->ihl += opt->opt.optlen>>2; |
@@ -430,8 +430,7 @@ packet_routed: | |||
430 | ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); | 430 | ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); |
431 | } | 431 | } |
432 | 432 | ||
433 | ip_select_ident_more(skb, &rt->dst, sk, | 433 | ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); |
434 | (skb_shinfo(skb)->gso_segs ?: 1) - 1); | ||
435 | 434 | ||
436 | /* TODO : should we use skb->sk here instead of sk ? */ | 435 | /* TODO : should we use skb->sk here instead of sk ? */ |
437 | skb->priority = sk->sk_priority; | 436 | skb->priority = sk->sk_priority; |
@@ -1379,7 +1378,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, | |||
1379 | iph->ttl = ttl; | 1378 | iph->ttl = ttl; |
1380 | iph->protocol = sk->sk_protocol; | 1379 | iph->protocol = sk->sk_protocol; |
1381 | ip_copy_addrs(iph, fl4); | 1380 | ip_copy_addrs(iph, fl4); |
1382 | ip_select_ident(skb, &rt->dst, sk); | 1381 | ip_select_ident(skb, sk); |
1383 | 1382 | ||
1384 | if (opt) { | 1383 | if (opt) { |
1385 | iph->ihl += opt->optlen>>2; | 1384 | iph->ihl += opt->optlen>>2; |
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index bcf206c79005..847e69cbff7e 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c | |||
@@ -74,7 +74,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, | |||
74 | iph->daddr = dst; | 74 | iph->daddr = dst; |
75 | iph->saddr = src; | 75 | iph->saddr = src; |
76 | iph->ttl = ttl; | 76 | iph->ttl = ttl; |
77 | __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); | 77 | __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); |
78 | 78 | ||
79 | err = ip_local_out_sk(sk, skb); | 79 | err = ip_local_out_sk(sk, skb); |
80 | if (unlikely(net_xmit_eval(err))) | 80 | if (unlikely(net_xmit_eval(err))) |
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2bc9cc47f246..65bcaa789043 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -1663,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) | |||
1663 | iph->protocol = IPPROTO_IPIP; | 1663 | iph->protocol = IPPROTO_IPIP; |
1664 | iph->ihl = 5; | 1664 | iph->ihl = 5; |
1665 | iph->tot_len = htons(skb->len); | 1665 | iph->tot_len = htons(skb->len); |
1666 | ip_select_ident(skb, skb_dst(skb), NULL); | 1666 | ip_select_ident(skb, NULL); |
1667 | ip_send_check(iph); | 1667 | ip_send_check(iph); |
1668 | 1668 | ||
1669 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | 1669 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); |
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index a9dbe58bdfe7..2c65160565e1 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -389,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, | |||
389 | iph->check = 0; | 389 | iph->check = 0; |
390 | iph->tot_len = htons(length); | 390 | iph->tot_len = htons(length); |
391 | if (!iph->id) | 391 | if (!iph->id) |
392 | ip_select_ident(skb, &rt->dst, NULL); | 392 | ip_select_ident(skb, NULL); |
393 | 393 | ||
394 | iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); | 394 | iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); |
395 | } | 395 | } |
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4154eb76b0ad..082239ffe34a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -89,6 +89,7 @@ | |||
89 | #include <linux/rcupdate.h> | 89 | #include <linux/rcupdate.h> |
90 | #include <linux/times.h> | 90 | #include <linux/times.h> |
91 | #include <linux/slab.h> | 91 | #include <linux/slab.h> |
92 | #include <linux/jhash.h> | ||
92 | #include <net/dst.h> | 93 | #include <net/dst.h> |
93 | #include <net/net_namespace.h> | 94 | #include <net/net_namespace.h> |
94 | #include <net/protocol.h> | 95 | #include <net/protocol.h> |
@@ -456,39 +457,19 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, | |||
456 | return neigh_create(&arp_tbl, pkey, dev); | 457 | return neigh_create(&arp_tbl, pkey, dev); |
457 | } | 458 | } |
458 | 459 | ||
459 | /* | 460 | atomic_t *ip_idents __read_mostly; |
460 | * Peer allocation may fail only in serious out-of-memory conditions. However | 461 | EXPORT_SYMBOL(ip_idents); |
461 | * we still can generate some output. | ||
462 | * Random ID selection looks a bit dangerous because we have no chances to | ||
463 | * select ID being unique in a reasonable period of time. | ||
464 | * But broken packet identifier may be better than no packet at all. | ||
465 | */ | ||
466 | static void ip_select_fb_ident(struct iphdr *iph) | ||
467 | { | ||
468 | static DEFINE_SPINLOCK(ip_fb_id_lock); | ||
469 | static u32 ip_fallback_id; | ||
470 | u32 salt; | ||
471 | 462 | ||
472 | spin_lock_bh(&ip_fb_id_lock); | 463 | void __ip_select_ident(struct iphdr *iph, int segs) |
473 | salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); | ||
474 | iph->id = htons(salt & 0xFFFF); | ||
475 | ip_fallback_id = salt; | ||
476 | spin_unlock_bh(&ip_fb_id_lock); | ||
477 | } | ||
478 | |||
479 | void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) | ||
480 | { | 464 | { |
481 | struct net *net = dev_net(dst->dev); | 465 | static u32 ip_idents_hashrnd __read_mostly; |
482 | struct inet_peer *peer; | 466 | u32 hash, id; |
483 | 467 | ||
484 | peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); | 468 | net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd)); |
485 | if (peer) { | ||
486 | iph->id = htons(inet_getid(peer, more)); | ||
487 | inet_putpeer(peer); | ||
488 | return; | ||
489 | } | ||
490 | 469 | ||
491 | ip_select_fb_ident(iph); | 470 | hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd); |
471 | id = ip_idents_reserve(hash, segs); | ||
472 | iph->id = htons(id); | ||
492 | } | 473 | } |
493 | EXPORT_SYMBOL(__ip_select_ident); | 474 | EXPORT_SYMBOL(__ip_select_ident); |
494 | 475 | ||
@@ -2711,6 +2692,12 @@ int __init ip_rt_init(void) | |||
2711 | { | 2692 | { |
2712 | int rc = 0; | 2693 | int rc = 0; |
2713 | 2694 | ||
2695 | ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); | ||
2696 | if (!ip_idents) | ||
2697 | panic("IP: failed to allocate ip_idents\n"); | ||
2698 | |||
2699 | prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); | ||
2700 | |||
2714 | #ifdef CONFIG_IP_ROUTE_CLASSID | 2701 | #ifdef CONFIG_IP_ROUTE_CLASSID |
2715 | ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); | 2702 | ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); |
2716 | if (!ip_rt_acct) | 2703 | if (!ip_rt_acct) |
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 05f2b484954f..91771a7c802f 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c | |||
@@ -58,12 +58,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) | |||
58 | 58 | ||
59 | top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? | 59 | top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? |
60 | 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); | 60 | 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); |
61 | ip_select_ident(skb, dst->child, NULL); | ||
62 | 61 | ||
63 | top_iph->ttl = ip4_dst_hoplimit(dst->child); | 62 | top_iph->ttl = ip4_dst_hoplimit(dst->child); |
64 | 63 | ||
65 | top_iph->saddr = x->props.saddr.a4; | 64 | top_iph->saddr = x->props.saddr.a4; |
66 | top_iph->daddr = x->id.daddr.a4; | 65 | top_iph->daddr = x->id.daddr.a4; |
66 | ip_select_ident(skb, NULL); | ||
67 | 67 | ||
68 | return 0; | 68 | return 0; |
69 | } | 69 | } |