diff options
author | Eric Dumazet <edumazet@google.com> | 2015-01-30 00:35:05 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-02-02 02:06:19 -0500 |
commit | bdbbb8527b6f6a358dbcb70dac247034d665b8e4 (patch) | |
tree | d3c764600d9d7a18956943fcb5c0de8f2e0a6c43 | |
parent | 0d32ef8cef9aa8f375e128f78b77caceaa7e8da0 (diff) |
ipv4: tcp: get rid of ugly unicast_sock
In commit be9f4a44e7d41 ("ipv4: tcp: remove per net tcp_sock")
I tried to address contention on a socket lock, but the solution
I chose was horrible:
commit 3a7c384ffd57e ("ipv4: tcp: unicast_sock should not land outside
of TCP stack") addressed a selinux regression.
commit 0980e56e506b ("ipv4: tcp: set unicast_sock uc_ttl to -1")
took care of another regression.
commit b5ec8eeac46 ("ipv4: fix ip_send_skb()") fixed another regression.
commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate")
was another shot in the dark.
Really, just use a proper socket per cpu, and remove the skb_orphan()
call, to re-enable flow control.
This solves a serious problem with FQ packet scheduler when used in
hostile environments, as we do not want to allocate a flow structure
for every RST packet sent in response to a spoofed packet.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/ip.h | 2 | ||||
-rw-r--r-- | include/net/netns/ipv4.h | 1 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 30 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 37 |
4 files changed, 37 insertions, 33 deletions
diff --git a/include/net/ip.h b/include/net/ip.h index f7cbd703d15d..09cf5aebb283 100644 --- a/include/net/ip.h +++ b/include/net/ip.h | |||
@@ -181,7 +181,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) | |||
181 | return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; | 181 | return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; |
182 | } | 182 | } |
183 | 183 | ||
184 | void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, | 184 | void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, |
185 | const struct ip_options *sopt, | 185 | const struct ip_options *sopt, |
186 | __be32 daddr, __be32 saddr, | 186 | __be32 daddr, __be32 saddr, |
187 | const struct ip_reply_arg *arg, | 187 | const struct ip_reply_arg *arg, |
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 24945cefc4fd..0ffef1a38efc 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h | |||
@@ -52,6 +52,7 @@ struct netns_ipv4 { | |||
52 | struct inet_peer_base *peers; | 52 | struct inet_peer_base *peers; |
53 | struct tcpm_hash_bucket *tcp_metrics_hash; | 53 | struct tcpm_hash_bucket *tcp_metrics_hash; |
54 | unsigned int tcp_metrics_hash_log; | 54 | unsigned int tcp_metrics_hash_log; |
55 | struct sock * __percpu *tcp_sk; | ||
55 | struct netns_frags frags; | 56 | struct netns_frags frags; |
56 | #ifdef CONFIG_NETFILTER | 57 | #ifdef CONFIG_NETFILTER |
57 | struct xt_table *iptable_filter; | 58 | struct xt_table *iptable_filter; |
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 38a20a9cca1a..c373c0708d97 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -1506,24 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, | |||
1506 | /* | 1506 | /* |
1507 | * Generic function to send a packet as reply to another packet. | 1507 | * Generic function to send a packet as reply to another packet. |
1508 | * Used to send some TCP resets/acks so far. | 1508 | * Used to send some TCP resets/acks so far. |
1509 | * | ||
1510 | * Use a fake percpu inet socket to avoid false sharing and contention. | ||
1511 | */ | 1509 | */ |
1512 | static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { | 1510 | void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, |
1513 | .sk = { | ||
1514 | .__sk_common = { | ||
1515 | .skc_refcnt = ATOMIC_INIT(1), | ||
1516 | }, | ||
1517 | .sk_wmem_alloc = ATOMIC_INIT(1), | ||
1518 | .sk_allocation = GFP_ATOMIC, | ||
1519 | .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), | ||
1520 | .sk_pacing_rate = ~0U, | ||
1521 | }, | ||
1522 | .pmtudisc = IP_PMTUDISC_WANT, | ||
1523 | .uc_ttl = -1, | ||
1524 | }; | ||
1525 | |||
1526 | void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, | ||
1527 | const struct ip_options *sopt, | 1511 | const struct ip_options *sopt, |
1528 | __be32 daddr, __be32 saddr, | 1512 | __be32 daddr, __be32 saddr, |
1529 | const struct ip_reply_arg *arg, | 1513 | const struct ip_reply_arg *arg, |
@@ -1533,9 +1517,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, | |||
1533 | struct ipcm_cookie ipc; | 1517 | struct ipcm_cookie ipc; |
1534 | struct flowi4 fl4; | 1518 | struct flowi4 fl4; |
1535 | struct rtable *rt = skb_rtable(skb); | 1519 | struct rtable *rt = skb_rtable(skb); |
1520 | struct net *net = sock_net(sk); | ||
1536 | struct sk_buff *nskb; | 1521 | struct sk_buff *nskb; |
1537 | struct sock *sk; | ||
1538 | struct inet_sock *inet; | ||
1539 | int err; | 1522 | int err; |
1540 | 1523 | ||
1541 | if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) | 1524 | if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) |
@@ -1566,15 +1549,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, | |||
1566 | if (IS_ERR(rt)) | 1549 | if (IS_ERR(rt)) |
1567 | return; | 1550 | return; |
1568 | 1551 | ||
1569 | inet = &get_cpu_var(unicast_sock); | 1552 | inet_sk(sk)->tos = arg->tos; |
1570 | 1553 | ||
1571 | inet->tos = arg->tos; | ||
1572 | sk = &inet->sk; | ||
1573 | sk->sk_priority = skb->priority; | 1554 | sk->sk_priority = skb->priority; |
1574 | sk->sk_protocol = ip_hdr(skb)->protocol; | 1555 | sk->sk_protocol = ip_hdr(skb)->protocol; |
1575 | sk->sk_bound_dev_if = arg->bound_dev_if; | 1556 | sk->sk_bound_dev_if = arg->bound_dev_if; |
1576 | sock_net_set(sk, net); | ||
1577 | __skb_queue_head_init(&sk->sk_write_queue); | ||
1578 | sk->sk_sndbuf = sysctl_wmem_default; | 1557 | sk->sk_sndbuf = sysctl_wmem_default; |
1579 | err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, | 1558 | err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, |
1580 | len, 0, &ipc, &rt, MSG_DONTWAIT); | 1559 | len, 0, &ipc, &rt, MSG_DONTWAIT); |
@@ -1590,13 +1569,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, | |||
1590 | arg->csumoffset) = csum_fold(csum_add(nskb->csum, | 1569 | arg->csumoffset) = csum_fold(csum_add(nskb->csum, |
1591 | arg->csum)); | 1570 | arg->csum)); |
1592 | nskb->ip_summed = CHECKSUM_NONE; | 1571 | nskb->ip_summed = CHECKSUM_NONE; |
1593 | skb_orphan(nskb); | ||
1594 | skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); | 1572 | skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); |
1595 | ip_push_pending_frames(sk, &fl4); | 1573 | ip_push_pending_frames(sk, &fl4); |
1596 | } | 1574 | } |
1597 | out: | 1575 | out: |
1598 | put_cpu_var(unicast_sock); | ||
1599 | |||
1600 | ip_rt_put(rt); | 1576 | ip_rt_put(rt); |
1601 | } | 1577 | } |
1602 | 1578 | ||
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a3f72d7fc06c..d22f54482bab 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
683 | arg.bound_dev_if = sk->sk_bound_dev_if; | 683 | arg.bound_dev_if = sk->sk_bound_dev_if; |
684 | 684 | ||
685 | arg.tos = ip_hdr(skb)->tos; | 685 | arg.tos = ip_hdr(skb)->tos; |
686 | ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, | 686 | ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), |
687 | skb, &TCP_SKB_CB(skb)->header.h4.opt, | ||
687 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, | 688 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, |
688 | &arg, arg.iov[0].iov_len); | 689 | &arg, arg.iov[0].iov_len); |
689 | 690 | ||
@@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | |||
767 | if (oif) | 768 | if (oif) |
768 | arg.bound_dev_if = oif; | 769 | arg.bound_dev_if = oif; |
769 | arg.tos = tos; | 770 | arg.tos = tos; |
770 | ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, | 771 | ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), |
772 | skb, &TCP_SKB_CB(skb)->header.h4.opt, | ||
771 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, | 773 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, |
772 | &arg, arg.iov[0].iov_len); | 774 | &arg, arg.iov[0].iov_len); |
773 | 775 | ||
@@ -2428,14 +2430,39 @@ struct proto tcp_prot = { | |||
2428 | }; | 2430 | }; |
2429 | EXPORT_SYMBOL(tcp_prot); | 2431 | EXPORT_SYMBOL(tcp_prot); |
2430 | 2432 | ||
2433 | static void __net_exit tcp_sk_exit(struct net *net) | ||
2434 | { | ||
2435 | int cpu; | ||
2436 | |||
2437 | for_each_possible_cpu(cpu) | ||
2438 | inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); | ||
2439 | free_percpu(net->ipv4.tcp_sk); | ||
2440 | } | ||
2441 | |||
2431 | static int __net_init tcp_sk_init(struct net *net) | 2442 | static int __net_init tcp_sk_init(struct net *net) |
2432 | { | 2443 | { |
2444 | int res, cpu; | ||
2445 | |||
2446 | net->ipv4.tcp_sk = alloc_percpu(struct sock *); | ||
2447 | if (!net->ipv4.tcp_sk) | ||
2448 | return -ENOMEM; | ||
2449 | |||
2450 | for_each_possible_cpu(cpu) { | ||
2451 | struct sock *sk; | ||
2452 | |||
2453 | res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, | ||
2454 | IPPROTO_TCP, net); | ||
2455 | if (res) | ||
2456 | goto fail; | ||
2457 | *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; | ||
2458 | } | ||
2433 | net->ipv4.sysctl_tcp_ecn = 2; | 2459 | net->ipv4.sysctl_tcp_ecn = 2; |
2434 | return 0; | 2460 | return 0; |
2435 | } | ||
2436 | 2461 | ||
2437 | static void __net_exit tcp_sk_exit(struct net *net) | 2462 | fail: |
2438 | { | 2463 | tcp_sk_exit(net); |
2464 | |||
2465 | return res; | ||
2439 | } | 2466 | } |
2440 | 2467 | ||
2441 | static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) | 2468 | static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) |