ipv4: tcp: get rid of ugly unicast_sock

In commit be9f4a44e7d41 ("ipv4: tcp: remove per net tcp_sock") I tried to address contention on a socket lock, but the solution I chose was horrible:

commit 3a7c384ffd57e ("ipv4: tcp: unicast_sock should not land outside of TCP stack") addressed a selinux regression.

commit 0980e56e506b ("ipv4: tcp: set unicast_sock uc_ttl to -1") took care of another regression.

commit b5ec8eeac46 ("ipv4: fix ip_send_skb()") fixed another regression.

commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate") was another shot in the dark.

Really, just use a proper socket per cpu, and remove the skb_orphan() call, to re-enable flow control.

This solves a serious problem with FQ packet scheduler when used in hostile environments, as we do not want to allocate a flow structure for every RST packet sent in response to a spoofed packet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
author: Eric Dumazet <edumazet@google.com> 2015-01-30 00:35:05 -0500
committer: David S. Miller <davem@davemloft.net> 2015-02-02 02:06:19 -0500
commit: bdbbb8527b6f6a358dbcb70dac247034d665b8e4 (patch)
tree: d3c764600d9d7a18956943fcb5c0de8f2e0a6c43 /net/ipv4
parent: 0d32ef8cef9aa8f375e128f78b77caceaa7e8da0 (diff)
2 files changed, 35 insertions, 32 deletions
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 38a20a9cca1a..c373c0708d97 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1506,24 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
 /*
 *      Generic function to send a packet as reply to another packet.
 *      Used to send some TCP resets/acks so far.
- *
- *      Use a fake percpu inet socket to avoid false sharing and contention.
 */
-static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
+void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
-        .sk = {
-                .__sk_common = {
-                        .skc_refcnt = ATOMIC_INIT(1),
-                },
-                .sk_wmem_alloc  = ATOMIC_INIT(1),
-                .sk_allocation  = GFP_ATOMIC,
-                .sk_flags       = (1UL << SOCK_USE_WRITE_QUEUE),
-                .sk_pacing_rate = ~0U,
-        },
-        .pmtudisc       = IP_PMTUDISC_WANT,
-        .uc_ttl         = -1,
-};
-void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
                           const struct ip_options *sopt,
                           __be32 daddr, __be32 saddr,
                           const struct ip_reply_arg *arg,
@@ -1533,9 +1517,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
        struct ipcm_cookie ipc;
        struct flowi4 fl4;
        struct rtable *rt = skb_rtable(skb);
+        struct net *net = sock_net(sk);
        struct sk_buff *nskb;
-        struct sock *sk;
-        struct inet_sock *inet;
        int err;
        if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
@@ -1566,15 +1549,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
        if (IS_ERR(rt))
                return;
-        inet = &get_cpu_var(unicast_sock);
+        inet_sk(sk)->tos = arg->tos;
-        inet->tos = arg->tos;
-        sk = &inet->sk;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
-        sock_net_set(sk, net);
-        __skb_queue_head_init(&sk->sk_write_queue);
        sk->sk_sndbuf = sysctl_wmem_default;
        err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
                             len, 0, &ipc, &rt, MSG_DONTWAIT);
@@ -1590,13 +1569,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
                          arg->csumoffset) = csum_fold(csum_add(nskb->csum,
                                                                arg->csum));
                nskb->ip_summed = CHECKSUM_NONE;
-                skb_orphan(nskb);
                skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
                ip_push_pending_frames(sk, &fl4);
        }
 out:
-        put_cpu_var(unicast_sock);
        ip_rt_put(rt);
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7fc06c..d22f54482bab 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
                arg.bound_dev_if = sk->sk_bound_dev_if;
        arg.tos = ip_hdr(skb)->tos;
-        ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);
@@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
        if (oif)
                arg.bound_dev_if = oif;
        arg.tos = tos;
-        ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
+        ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+                              skb, &TCP_SKB_CB(skb)->header.h4.opt,
                              ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
                              &arg, arg.iov[0].iov_len);
@@ -2428,14 +2430,39 @@ struct proto tcp_prot = {
 };
 EXPORT_SYMBOL(tcp_prot);
+static void __net_exit tcp_sk_exit(struct net *net)
+{
+        int cpu;
+        for_each_possible_cpu(cpu)
+                inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
+        free_percpu(net->ipv4.tcp_sk);
+}
 static int __net_init tcp_sk_init(struct net *net)
 {
+        int res, cpu;
+        net->ipv4.tcp_sk = alloc_percpu(struct sock *);
+        if (!net->ipv4.tcp_sk)
+                return -ENOMEM;
+        for_each_possible_cpu(cpu) {
+                struct sock *sk;
+                res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+                                           IPPROTO_TCP, net);
+                if (res)
+                        goto fail;
+                *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
+        }
        net->ipv4.sysctl_tcp_ecn = 2;
        return 0;
-}
-static void __net_exit tcp_sk_exit(struct net *net)
+fail:
-{
+        tcp_sk_exit(net);
+        return res;
 }
 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
author	Eric Dumazet <edumazet@google.com>	2015-01-30 00:35:05 -0500
committer	David S. Miller <davem@davemloft.net>	2015-02-02 02:06:19 -0500
commit	bdbbb8527b6f6a358dbcb70dac247034d665b8e4 (patch)
tree	d3c764600d9d7a18956943fcb5c0de8f2e0a6c43 /net/ipv4
parent	0d32ef8cef9aa8f375e128f78b77caceaa7e8da0 (diff)