aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2014-06-02 08:26:03 -0400
committerDavid S. Miller <davem@davemloft.net>2014-06-02 14:00:41 -0400
commit73f156a6e8c1074ac6327e0abd1169e95eb66463 (patch)
tree2c8b222f21784e738c397ba95dee70a8f256ea64 /net/ipv4
parente067ee336a9d3f038ffa9699c59f2abec3376bf7 (diff)
inetpeer: get rid of ip_id_count
Ideally, we would need to generate IP ID using a per destination IP generator. linux kernels used inet_peer cache for this purpose, but this had a huge cost on servers disabling MTU discovery. 1) each inet_peer struct consumes 192 bytes 2) inetpeer cache uses a binary tree of inet_peer structs, with a nominal size of ~66000 elements under load. 3) lookups in this tree are hitting a lot of cache lines, as tree depth is about 20. 4) If server deals with many tcp flows, we have a high probability of not finding the inet_peer, allocating a fresh one, inserting it in the tree with same initial ip_id_count, (cf secure_ip_id()) 5) We garbage collect inet_peer aggressively. IP ID generation do not have to be 'perfect' Goal is trying to avoid duplicates in a short period of time, so that reassembly units have a chance to complete reassembly of fragments belonging to one message before receiving other fragments with a recycled ID. We simply use an array of generators, and a Jenkin hash using the dst IP as a key. ipv6_select_ident() is put back into net/ipv6/ip6_output.c where it belongs (it is only used from this file) secure_ip_id() and secure_ipv6_id() no longer are needed. Rename ip_select_ident_more() to ip_select_ident_segs() to avoid unnecessary decrement/increment of the number of segments. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/igmp.c4
-rw-r--r--net/ipv4/inetpeer.c18
-rw-r--r--net/ipv4/ip_output.c7
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/ipmr.c2
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/route.c45
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c2
8 files changed, 25 insertions, 57 deletions
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 17d34e3c2ac3..6748d420f714 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -369,7 +369,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
369 pip->saddr = fl4.saddr; 369 pip->saddr = fl4.saddr;
370 pip->protocol = IPPROTO_IGMP; 370 pip->protocol = IPPROTO_IGMP;
371 pip->tot_len = 0; /* filled in later */ 371 pip->tot_len = 0; /* filled in later */
372 ip_select_ident(skb, &rt->dst, NULL); 372 ip_select_ident(skb, NULL);
373 ((u8 *)&pip[1])[0] = IPOPT_RA; 373 ((u8 *)&pip[1])[0] = IPOPT_RA;
374 ((u8 *)&pip[1])[1] = 4; 374 ((u8 *)&pip[1])[1] = 4;
375 ((u8 *)&pip[1])[2] = 0; 375 ((u8 *)&pip[1])[2] = 0;
@@ -714,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
714 iph->daddr = dst; 714 iph->daddr = dst;
715 iph->saddr = fl4.saddr; 715 iph->saddr = fl4.saddr;
716 iph->protocol = IPPROTO_IGMP; 716 iph->protocol = IPPROTO_IGMP;
717 ip_select_ident(skb, &rt->dst, NULL); 717 ip_select_ident(skb, NULL);
718 ((u8 *)&iph[1])[0] = IPOPT_RA; 718 ((u8 *)&iph[1])[0] = IPOPT_RA;
719 ((u8 *)&iph[1])[1] = 4; 719 ((u8 *)&iph[1])[1] = 4;
720 ((u8 *)&iph[1])[2] = 0; 720 ((u8 *)&iph[1])[2] = 0;
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index c98cf141f4ed..4ced1b9a97f0 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -26,20 +26,7 @@
26 * Theory of operations. 26 * Theory of operations.
27 * We keep one entry for each peer IP address. The nodes contains long-living 27 * We keep one entry for each peer IP address. The nodes contains long-living
28 * information about the peer which doesn't depend on routes. 28 * information about the peer which doesn't depend on routes.
29 * At this moment this information consists only of ID field for the next
30 * outgoing IP packet. This field is incremented with each packet as encoded
31 * in inet_getid() function (include/net/inetpeer.h).
32 * At the moment of writing this notes identifier of IP packets is generated
33 * to be unpredictable using this code only for packets subjected
34 * (actually or potentially) to defragmentation. I.e. DF packets less than
35 * PMTU in size when local fragmentation is disabled use a constant ID and do
36 * not use this code (see ip_select_ident() in include/net/ip.h).
37 * 29 *
38 * Route cache entries hold references to our nodes.
39 * New cache entries get references via lookup by destination IP address in
40 * the avl tree. The reference is grabbed only when it's needed i.e. only
41 * when we try to output IP packet which needs an unpredictable ID (see
42 * __ip_select_ident() in net/ipv4/route.c).
43 * Nodes are removed only when reference counter goes to 0. 30 * Nodes are removed only when reference counter goes to 0.
44 * When it's happened the node may be removed when a sufficient amount of 31 * When it's happened the node may be removed when a sufficient amount of
45 * time has been passed since its last use. The less-recently-used entry can 32 * time has been passed since its last use. The less-recently-used entry can
@@ -62,7 +49,6 @@
62 * refcnt: atomically against modifications on other CPU; 49 * refcnt: atomically against modifications on other CPU;
63 * usually under some other lock to prevent node disappearing 50 * usually under some other lock to prevent node disappearing
64 * daddr: unchangeable 51 * daddr: unchangeable
65 * ip_id_count: atomic value (no lock needed)
66 */ 52 */
67 53
68static struct kmem_cache *peer_cachep __read_mostly; 54static struct kmem_cache *peer_cachep __read_mostly;
@@ -497,10 +483,6 @@ relookup:
497 p->daddr = *daddr; 483 p->daddr = *daddr;
498 atomic_set(&p->refcnt, 1); 484 atomic_set(&p->refcnt, 1);
499 atomic_set(&p->rid, 0); 485 atomic_set(&p->rid, 0);
500 atomic_set(&p->ip_id_count,
501 (daddr->family == AF_INET) ?
502 secure_ip_id(daddr->addr.a4) :
503 secure_ipv6_id(daddr->addr.a6));
504 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; 486 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
505 p->rate_tokens = 0; 487 p->rate_tokens = 0;
506 /* 60*HZ is arbitrary, but chosen enough high so that the first 488 /* 60*HZ is arbitrary, but chosen enough high so that the first
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6e231ab58d65..8d3b6b0e9857 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
148 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); 148 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
149 iph->saddr = saddr; 149 iph->saddr = saddr;
150 iph->protocol = sk->sk_protocol; 150 iph->protocol = sk->sk_protocol;
151 ip_select_ident(skb, &rt->dst, sk); 151 ip_select_ident(skb, sk);
152 152
153 if (opt && opt->opt.optlen) { 153 if (opt && opt->opt.optlen) {
154 iph->ihl += opt->opt.optlen>>2; 154 iph->ihl += opt->opt.optlen>>2;
@@ -430,8 +430,7 @@ packet_routed:
430 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); 430 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
431 } 431 }
432 432
433 ip_select_ident_more(skb, &rt->dst, sk, 433 ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1);
434 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
435 434
436 /* TODO : should we use skb->sk here instead of sk ? */ 435 /* TODO : should we use skb->sk here instead of sk ? */
437 skb->priority = sk->sk_priority; 436 skb->priority = sk->sk_priority;
@@ -1379,7 +1378,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1379 iph->ttl = ttl; 1378 iph->ttl = ttl;
1380 iph->protocol = sk->sk_protocol; 1379 iph->protocol = sk->sk_protocol;
1381 ip_copy_addrs(iph, fl4); 1380 ip_copy_addrs(iph, fl4);
1382 ip_select_ident(skb, &rt->dst, sk); 1381 ip_select_ident(skb, sk);
1383 1382
1384 if (opt) { 1383 if (opt) {
1385 iph->ihl += opt->optlen>>2; 1384 iph->ihl += opt->optlen>>2;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index bcf206c79005..847e69cbff7e 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -74,7 +74,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
74 iph->daddr = dst; 74 iph->daddr = dst;
75 iph->saddr = src; 75 iph->saddr = src;
76 iph->ttl = ttl; 76 iph->ttl = ttl;
77 __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); 77 __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
78 78
79 err = ip_local_out_sk(sk, skb); 79 err = ip_local_out_sk(sk, skb);
80 if (unlikely(net_xmit_eval(err))) 80 if (unlikely(net_xmit_eval(err)))
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2bc9cc47f246..65bcaa789043 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1663,7 +1663,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1663 iph->protocol = IPPROTO_IPIP; 1663 iph->protocol = IPPROTO_IPIP;
1664 iph->ihl = 5; 1664 iph->ihl = 5;
1665 iph->tot_len = htons(skb->len); 1665 iph->tot_len = htons(skb->len);
1666 ip_select_ident(skb, skb_dst(skb), NULL); 1666 ip_select_ident(skb, NULL);
1667 ip_send_check(iph); 1667 ip_send_check(iph);
1668 1668
1669 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 1669 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index a9dbe58bdfe7..2c65160565e1 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -389,7 +389,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
389 iph->check = 0; 389 iph->check = 0;
390 iph->tot_len = htons(length); 390 iph->tot_len = htons(length);
391 if (!iph->id) 391 if (!iph->id)
392 ip_select_ident(skb, &rt->dst, NULL); 392 ip_select_ident(skb, NULL);
393 393
394 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 394 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
395 } 395 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4154eb76b0ad..082239ffe34a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -89,6 +89,7 @@
89#include <linux/rcupdate.h> 89#include <linux/rcupdate.h>
90#include <linux/times.h> 90#include <linux/times.h>
91#include <linux/slab.h> 91#include <linux/slab.h>
92#include <linux/jhash.h>
92#include <net/dst.h> 93#include <net/dst.h>
93#include <net/net_namespace.h> 94#include <net/net_namespace.h>
94#include <net/protocol.h> 95#include <net/protocol.h>
@@ -456,39 +457,19 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
456 return neigh_create(&arp_tbl, pkey, dev); 457 return neigh_create(&arp_tbl, pkey, dev);
457} 458}
458 459
459/* 460atomic_t *ip_idents __read_mostly;
460 * Peer allocation may fail only in serious out-of-memory conditions. However 461EXPORT_SYMBOL(ip_idents);
461 * we still can generate some output.
462 * Random ID selection looks a bit dangerous because we have no chances to
463 * select ID being unique in a reasonable period of time.
464 * But broken packet identifier may be better than no packet at all.
465 */
466static void ip_select_fb_ident(struct iphdr *iph)
467{
468 static DEFINE_SPINLOCK(ip_fb_id_lock);
469 static u32 ip_fallback_id;
470 u32 salt;
471 462
472 spin_lock_bh(&ip_fb_id_lock); 463void __ip_select_ident(struct iphdr *iph, int segs)
473 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
474 iph->id = htons(salt & 0xFFFF);
475 ip_fallback_id = salt;
476 spin_unlock_bh(&ip_fb_id_lock);
477}
478
479void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
480{ 464{
481 struct net *net = dev_net(dst->dev); 465 static u32 ip_idents_hashrnd __read_mostly;
482 struct inet_peer *peer; 466 u32 hash, id;
483 467
484 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); 468 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
485 if (peer) {
486 iph->id = htons(inet_getid(peer, more));
487 inet_putpeer(peer);
488 return;
489 }
490 469
491 ip_select_fb_ident(iph); 470 hash = jhash_1word((__force u32)iph->daddr, ip_idents_hashrnd);
471 id = ip_idents_reserve(hash, segs);
472 iph->id = htons(id);
492} 473}
493EXPORT_SYMBOL(__ip_select_ident); 474EXPORT_SYMBOL(__ip_select_ident);
494 475
@@ -2711,6 +2692,12 @@ int __init ip_rt_init(void)
2711{ 2692{
2712 int rc = 0; 2693 int rc = 0;
2713 2694
2695 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2696 if (!ip_idents)
2697 panic("IP: failed to allocate ip_idents\n");
2698
2699 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2700
2714#ifdef CONFIG_IP_ROUTE_CLASSID 2701#ifdef CONFIG_IP_ROUTE_CLASSID
2715 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 2702 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2716 if (!ip_rt_acct) 2703 if (!ip_rt_acct)
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 05f2b484954f..91771a7c802f 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -58,12 +58,12 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
58 58
59 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 59 top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
60 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF)); 60 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
61 ip_select_ident(skb, dst->child, NULL);
62 61
63 top_iph->ttl = ip4_dst_hoplimit(dst->child); 62 top_iph->ttl = ip4_dst_hoplimit(dst->child);
64 63
65 top_iph->saddr = x->props.saddr.a4; 64 top_iph->saddr = x->props.saddr.a4;
66 top_iph->daddr = x->id.daddr.a4; 65 top_iph->daddr = x->id.daddr.a4;
66 ip_select_ident(skb, NULL);
67 67
68 return 0; 68 return 0;
69} 69}