aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2012-07-19 03:34:03 -0400
committerDavid S. Miller <davem@davemloft.net>2012-07-19 13:35:30 -0400
commitbe9f4a44e7d41cee50ddb5f038fc2391cbbb4046 (patch)
tree184e45a62fa0b4d15961427c0e8d5a496f0617a5
parentaee06da6726d4981c51928c2d6d1e2cabeec7a10 (diff)
ipv4: tcp: remove per net tcp_sock
tcp_v4_send_reset() and tcp_v4_send_ack() use a single socket per network namespace. This leads to bad behavior on multiqueue NICs, because many cpus contend for the socket lock and once socket lock is acquired, extra false sharing on various socket fields slows down the operations. To better resist attacks, we use a percpu socket. Each cpu can run without contention, using appropriate memory (local node). Additional features: 1) We also mirror the queue_mapping of the incoming skb, so that answers use the same queue if possible. 2) Setting the SOCK_USE_WRITE_QUEUE socket flag speeds up sock_wfree() 3) We now limit the number of in-flight RST/ACK [1] packets per cpu, instead of per namespace, and we honor the sysctl_wmem_default limit dynamically. (Prior to this patch, the sysctl_wmem_default value was copied at boot time, so any further change would not affect the tcp_sock limit) [1] These packets are only generated when no socket was matched for the incoming packet. Reported-by: Bill Sommerfeld <wsommerfeld@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/ip.h2
-rw-r--r--include/net/netns/ipv4.h1
-rw-r--r--net/ipv4/ip_output.c50
-rw-r--r--net/ipv4/tcp_ipv4.c8
4 files changed, 36 insertions, 25 deletions
diff --git a/include/net/ip.h b/include/net/ip.h
index ec5cfde85e9..bd5e444a19c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -158,7 +158,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
158 return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; 158 return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
159} 159}
160 160
161void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, 161void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
162 __be32 saddr, const struct ip_reply_arg *arg, 162 __be32 saddr, const struct ip_reply_arg *arg,
163 unsigned int len); 163 unsigned int len);
164 164
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2e089a99d60..d909c7fc3da 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -38,7 +38,6 @@ struct netns_ipv4 {
38 struct sock *fibnl; 38 struct sock *fibnl;
39 39
40 struct sock **icmp_sk; 40 struct sock **icmp_sk;
41 struct sock *tcp_sock;
42 struct inet_peer_base *peers; 41 struct inet_peer_base *peers;
43 struct tcpm_hash_bucket *tcp_metrics_hash; 42 struct tcpm_hash_bucket *tcp_metrics_hash;
44 unsigned int tcp_metrics_hash_mask; 43 unsigned int tcp_metrics_hash_mask;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cc52679790b..c528f841ca4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1463,20 +1463,33 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1463 1463
1464/* 1464/*
1465 * Generic function to send a packet as reply to another packet. 1465 * Generic function to send a packet as reply to another packet.
1466 * Used to send TCP resets so far. 1466 * Used to send some TCP resets/acks so far.
1467 * 1467 *
1468 * Should run single threaded per socket because it uses the sock 1468 * Use a fake percpu inet socket to avoid false sharing and contention.
1469 * structure to pass arguments.
1470 */ 1469 */
1471void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr, 1470static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
1471 .sk = {
1472 .__sk_common = {
1473 .skc_refcnt = ATOMIC_INIT(1),
1474 },
1475 .sk_wmem_alloc = ATOMIC_INIT(1),
1476 .sk_allocation = GFP_ATOMIC,
1477 .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
1478 },
1479 .pmtudisc = IP_PMTUDISC_WANT,
1480};
1481
1482void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
1472 __be32 saddr, const struct ip_reply_arg *arg, 1483 __be32 saddr, const struct ip_reply_arg *arg,
1473 unsigned int len) 1484 unsigned int len)
1474{ 1485{
1475 struct inet_sock *inet = inet_sk(sk);
1476 struct ip_options_data replyopts; 1486 struct ip_options_data replyopts;
1477 struct ipcm_cookie ipc; 1487 struct ipcm_cookie ipc;
1478 struct flowi4 fl4; 1488 struct flowi4 fl4;
1479 struct rtable *rt = skb_rtable(skb); 1489 struct rtable *rt = skb_rtable(skb);
1490 struct sk_buff *nskb;
1491 struct sock *sk;
1492 struct inet_sock *inet;
1480 1493
1481 if (ip_options_echo(&replyopts.opt.opt, skb)) 1494 if (ip_options_echo(&replyopts.opt.opt, skb))
1482 return; 1495 return;
@@ -1494,38 +1507,39 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1494 1507
1495 flowi4_init_output(&fl4, arg->bound_dev_if, 0, 1508 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1496 RT_TOS(arg->tos), 1509 RT_TOS(arg->tos),
1497 RT_SCOPE_UNIVERSE, sk->sk_protocol, 1510 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1498 ip_reply_arg_flowi_flags(arg), 1511 ip_reply_arg_flowi_flags(arg),
1499 daddr, saddr, 1512 daddr, saddr,
1500 tcp_hdr(skb)->source, tcp_hdr(skb)->dest); 1513 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1501 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 1514 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1502 rt = ip_route_output_key(sock_net(sk), &fl4); 1515 rt = ip_route_output_key(net, &fl4);
1503 if (IS_ERR(rt)) 1516 if (IS_ERR(rt))
1504 return; 1517 return;
1505 1518
1506 /* And let IP do all the hard work. 1519 inet = &get_cpu_var(unicast_sock);
1507 1520
1508 This chunk is not reenterable, hence spinlock.
1509 Note that it uses the fact, that this function is called
1510 with locally disabled BH and that sk cannot be already spinlocked.
1511 */
1512 bh_lock_sock(sk);
1513 inet->tos = arg->tos; 1521 inet->tos = arg->tos;
1522 sk = &inet->sk;
1514 sk->sk_priority = skb->priority; 1523 sk->sk_priority = skb->priority;
1515 sk->sk_protocol = ip_hdr(skb)->protocol; 1524 sk->sk_protocol = ip_hdr(skb)->protocol;
1516 sk->sk_bound_dev_if = arg->bound_dev_if; 1525 sk->sk_bound_dev_if = arg->bound_dev_if;
1526 sock_net_set(sk, net);
1527 __skb_queue_head_init(&sk->sk_write_queue);
1528 sk->sk_sndbuf = sysctl_wmem_default;
1517 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1529 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1518 &ipc, &rt, MSG_DONTWAIT); 1530 &ipc, &rt, MSG_DONTWAIT);
1519 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { 1531 nskb = skb_peek(&sk->sk_write_queue);
1532 if (nskb) {
1520 if (arg->csumoffset >= 0) 1533 if (arg->csumoffset >= 0)
1521 *((__sum16 *)skb_transport_header(skb) + 1534 *((__sum16 *)skb_transport_header(nskb) +
1522 arg->csumoffset) = csum_fold(csum_add(skb->csum, 1535 arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1523 arg->csum)); 1536 arg->csum));
1524 skb->ip_summed = CHECKSUM_NONE; 1537 nskb->ip_summed = CHECKSUM_NONE;
1538 skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
1525 ip_push_pending_frames(sk, &fl4); 1539 ip_push_pending_frames(sk, &fl4);
1526 } 1540 }
1527 1541
1528 bh_unlock_sock(sk); 1542 put_cpu_var(unicast_sock);
1529 1543
1530 ip_rt_put(rt); 1544 ip_rt_put(rt);
1531} 1545}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9caf5c07aa..d7d2fa50f07 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -688,7 +688,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
688 688
689 net = dev_net(skb_dst(skb)->dev); 689 net = dev_net(skb_dst(skb)->dev);
690 arg.tos = ip_hdr(skb)->tos; 690 arg.tos = ip_hdr(skb)->tos;
691 ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, 691 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
692 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); 692 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
693 693
694 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 694 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -771,7 +771,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
771 if (oif) 771 if (oif)
772 arg.bound_dev_if = oif; 772 arg.bound_dev_if = oif;
773 arg.tos = tos; 773 arg.tos = tos;
774 ip_send_unicast_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, 774 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
775 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); 775 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
776 776
777 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 777 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
@@ -2624,13 +2624,11 @@ EXPORT_SYMBOL(tcp_prot);
2624 2624
2625static int __net_init tcp_sk_init(struct net *net) 2625static int __net_init tcp_sk_init(struct net *net)
2626{ 2626{
2627 return inet_ctl_sock_create(&net->ipv4.tcp_sock, 2627 return 0;
2628 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2629} 2628}
2630 2629
2631static void __net_exit tcp_sk_exit(struct net *net) 2630static void __net_exit tcp_sk_exit(struct net *net)
2632{ 2631{
2633 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2634} 2632}
2635 2633
2636static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2634static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)