aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2015-01-30 00:35:05 -0500
committerDavid S. Miller <davem@davemloft.net>2015-02-02 02:06:19 -0500
commitbdbbb8527b6f6a358dbcb70dac247034d665b8e4 (patch)
treed3c764600d9d7a18956943fcb5c0de8f2e0a6c43
parent0d32ef8cef9aa8f375e128f78b77caceaa7e8da0 (diff)
ipv4: tcp: get rid of ugly unicast_sock
In commit be9f4a44e7d41 ("ipv4: tcp: remove per net tcp_sock") I tried to address contention on a socket lock, but the solution I chose was horrible :

commit 3a7c384ffd57e ("ipv4: tcp: unicast_sock should not land outside of TCP stack") addressed a selinux regression.

commit 0980e56e506b ("ipv4: tcp: set unicast_sock uc_ttl to -1") took care of another regression.

commit b5ec8eeac46 ("ipv4: fix ip_send_skb()") fixed another regression.

commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate") was another shot in the dark.

Really, just use a proper socket per cpu, and remove the skb_orphan() call, to re-enable flow control.

This solves a serious problem with FQ packet scheduler when used in hostile environments, as we do not want to allocate a flow structure for every RST packet sent in response to a spoofed packet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
 include/net/ip.h         |  2 +-
 include/net/netns/ipv4.h |  1 +
 net/ipv4/ip_output.c     | 30 +++++-------------------
 net/ipv4/tcp_ipv4.c      | 37 ++++++++++++++++++++++++++-----
 4 files changed, 37 insertions(+), 33 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index f7cbd703d15d..09cf5aebb283 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -181,7 +181,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
 	return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
 }
 
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   const struct ip_options *sopt,
 			   __be32 daddr, __be32 saddr,
 			   const struct ip_reply_arg *arg,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 24945cefc4fd..0ffef1a38efc 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -52,6 +52,7 @@ struct netns_ipv4 {
 	struct inet_peer_base	*peers;
 	struct tcpm_hash_bucket	*tcp_metrics_hash;
 	unsigned int		tcp_metrics_hash_log;
+	struct sock  * __percpu	*tcp_sk;
 	struct netns_frags	frags;
 #ifdef CONFIG_NETFILTER
 	struct xt_table		*iptable_filter;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 38a20a9cca1a..c373c0708d97 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1506,24 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
 /*
  *	Generic function to send a packet as reply to another packet.
  *	Used to send some TCP resets/acks so far.
- *
- *	Use a fake percpu inet socket to avoid false sharing and contention.
  */
-static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
-	.sk = {
-		.__sk_common = {
-			.skc_refcnt = ATOMIC_INIT(1),
-		},
-		.sk_wmem_alloc	= ATOMIC_INIT(1),
-		.sk_allocation	= GFP_ATOMIC,
-		.sk_flags	= (1UL << SOCK_USE_WRITE_QUEUE),
-		.sk_pacing_rate = ~0U,
-	},
-	.pmtudisc	= IP_PMTUDISC_WANT,
-	.uc_ttl		= -1,
-};
-
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 			   const struct ip_options *sopt,
 			   __be32 daddr, __be32 saddr,
 			   const struct ip_reply_arg *arg,
@@ -1533,9 +1517,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
 	struct rtable *rt = skb_rtable(skb);
+	struct net *net = sock_net(sk);
 	struct sk_buff *nskb;
-	struct sock *sk;
-	struct inet_sock *inet;
 	int err;
 
 	if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
@@ -1566,15 +1549,11 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 	if (IS_ERR(rt))
 		return;
 
-	inet = &get_cpu_var(unicast_sock);
+	inet_sk(sk)->tos = arg->tos;
 
-	inet->tos = arg->tos;
-	sk = &inet->sk;
 	sk->sk_priority = skb->priority;
 	sk->sk_protocol = ip_hdr(skb)->protocol;
 	sk->sk_bound_dev_if = arg->bound_dev_if;
-	sock_net_set(sk, net);
-	__skb_queue_head_init(&sk->sk_write_queue);
 	sk->sk_sndbuf = sysctl_wmem_default;
 	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
 			     len, 0, &ipc, &rt, MSG_DONTWAIT);
@@ -1590,13 +1569,10 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 		       arg->csumoffset) = csum_fold(csum_add(nskb->csum,
 							     arg->csum));
 		nskb->ip_summed = CHECKSUM_NONE;
-		skb_orphan(nskb);
 		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
 		ip_push_pending_frames(sk, &fl4);
 	}
 out:
-	put_cpu_var(unicast_sock);
-
 	ip_rt_put(rt);
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7fc06c..d22f54482bab 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 	arg.bound_dev_if = sk->sk_bound_dev_if;
 
 	arg.tos = ip_hdr(skb)->tos;
-	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
@@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 	if (oif)
 		arg.bound_dev_if = oif;
 	arg.tos = tos;
-	ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
@@ -2428,14 +2430,39 @@ struct proto tcp_prot = {
 };
 EXPORT_SYMBOL(tcp_prot);
 
+static void __net_exit tcp_sk_exit(struct net *net)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
+	free_percpu(net->ipv4.tcp_sk);
+}
+
 static int __net_init tcp_sk_init(struct net *net)
 {
+	int res, cpu;
+
+	net->ipv4.tcp_sk = alloc_percpu(struct sock *);
+	if (!net->ipv4.tcp_sk)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct sock *sk;
+
+		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+					   IPPROTO_TCP, net);
+		if (res)
+			goto fail;
+		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
+	}
 	net->ipv4.sysctl_tcp_ecn = 2;
 	return 0;
-}
 
-static void __net_exit tcp_sk_exit(struct net *net)
-{
+fail:
+	tcp_sk_exit(net);
+
+	return res;
 }
 
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)