author     Linus Torvalds <torvalds@linux-foundation.org>   2014-10-18 12:31:37 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-10-18 12:31:37 -0400
commit     2e923b0251932ad4a82cc87ec1443a1f1d17073e (patch)
tree       d12032bc9bcfbb8a57659275d1b9b582f23f2ecc /net/ipv4
parent     ffd8221bc348f8c282d1271883dbe629ea8ae289 (diff)
parent     f2d9da1a8375cbe53df5b415d059429013a3a79f (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Pull networking fixes from David Miller:
1) Include fixes for netrom and dsa (Fabian Frederick and Florian
   Fainelli)

2) Fix FIXED_PHY support in stmmac, from Giuseppe CAVALLARO.

3) Several SKB use after free fixes (vxlan, openvswitch, ip_tunnel,
   fou), from Li RongQing.

4) fec driver PTP support fixes from Luwei Zhou and Nimrod Andy.

5) Use after free in virtio_net, from Michael S. Tsirkin.

6) Fix flow mask handling for megaflows in openvswitch, from Pravin B
   Shelar.

7) ISDN gigaset and capi bug fixes from Tilman Schmidt.

8) Fix route leak in ip_send_unicast_reply(), from Vasily Averin.

9) Fix two eBPF JIT bugs on x86, from Alexei Starovoitov.

10) TCP_SKB_CB() reorganization caused a few regressions, fixed by Cong
    Wang and Eric Dumazet.

11) Don't overwrite end of SKB when parsing malformed sctp ASCONF
    chunks, from Daniel Borkmann.

12) Don't call sock_kfree_s() with NULL pointers; this function also
    has the side effect of adjusting the socket memory usage. From Cong
    Wang. (A sketch of this pitfall follows the list.)
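The sock_kfree_s() item deserves spelling out: the helper frees the buffer
and debits the socket's option-memory accounting by the size argument, so
passing NULL is not a harmless no-op; the accounting is still adjusted. A
minimal userspace sketch of the pitfall (fake_sock and fake_sock_kfree_s
are invented stand-ins, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct sock's option-memory accounting field. */
struct fake_sock {
	int omem_alloc;
};

/* Rough shape of sock_kfree_s(): free AND debit the accounting.
 * Note the debit happens even when mem == NULL.
 */
static void fake_sock_kfree_s(struct fake_sock *sk, void *mem, int size)
{
	free(mem);
	sk->omem_alloc -= size;
}

int main(void)
{
	struct fake_sock sk = { .omem_alloc = 0 };
	void *opt = NULL;	/* nothing was ever actually allocated */

	fake_sock_kfree_s(&sk, opt, 128);		/* the bug: underflow */
	printf("omem_alloc = %d\n", sk.omem_alloc);	/* prints -128, not 0 */
	return 0;
}

The fix is simply to guard the call so it only runs for buffers that were
actually allocated.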
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (90 commits)
bna: fix skb->truesize underestimation
net: dsa: add includes for ethtool and phy_fixed definitions
openvswitch: Set flow-key members.
netrom: use linux/uaccess.h
dsa: Fix conversion from host device to mii bus
tipc: fix bug in bundled buffer reception
ipv6: introduce tcp_v6_iif()
sfc: add support for skb->xmit_more
r8152: return -EBUSY for runtime suspend
ipv4: fix a potential use after free in fou.c
ipv4: fix a potential use after free in ip_tunnel_core.c
hyperv: Add handling of IP header with option field in netvsc_set_hash()
openvswitch: Create right mask with disabled megaflows
vxlan: fix a free after use
openvswitch: fix a use after free
ipv4: dst_entry leak in ip_send_unicast_reply()
ipv4: clean up cookie_v4_check()
ipv4: share tcp_v4_save_options() with cookie_v4_check()
ipv4: call __ip_options_echo() in cookie_v4_check()
atm: simplify lanai.c by using module_pci_driver
...
Diffstat (limited to 'net/ipv4')

 net/ipv4/fib_semantics.c  |  2
 net/ipv4/fou.c            |  3
 net/ipv4/ip_output.c      | 12
 net/ipv4/ip_tunnel_core.c |  3
 net/ipv4/syncookies.c     | 16
 net/ipv4/tcp_input.c      | 36
 net/ipv4/tcp_ipv4.c       | 22
 net/ipv4/tcp_output.c     | 34

 8 files changed, 72 insertions(+), 56 deletions(-)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 5b6efb3d2308..f99f41bd15b8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -537,7 +537,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 			return 1;
 
 		attrlen = rtnh_attrlen(rtnh);
-		if (attrlen < 0) {
+		if (attrlen > 0) {
 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
 
 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
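A note on the one-liner above: rtnh_attrlen() is essentially rtnh->rtnh_len
minus the size of the rtnexthop header, which is never negative once
rtnh_ok() has validated the entry, so the old `attrlen < 0` branch was dead
code and the RTA_GATEWAY (and, further down, RTA_FLOW) attributes of a
multipath nexthop were never actually compared. Flipping the test to
`attrlen > 0` makes fib_nh_match() inspect them as intended.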
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index efa70ad44906..32e78924e246 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -87,6 +87,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
 	if (!pskb_may_pull(skb, len))
 		goto drop;
 
+	uh = udp_hdr(skb);
+	guehdr = (struct guehdr *)&uh[1];
+
 	if (guehdr->version != 0)
 		goto drop;
 
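This fix and the ip_tunnel_core.c one below correct the same bug class from
item 3 of the pull message: pskb_may_pull() may expand and relocate the
skb's linear buffer, so any header pointer computed before the call can
dangle afterwards and must be re-derived from skb->data once the pull
succeeds. A self-contained userspace analogue (fake_skb and
fake_pskb_may_pull are illustrative names, not kernel code):

#include <stdlib.h>
#include <string.h>

/* Analogue of an sk_buff whose linear buffer can be reallocated. */
struct fake_skb {
	unsigned char *data;
	size_t len;
};

/* Like pskb_may_pull(): make 'need' bytes linear, possibly moving
 * the buffer. Returns 1 on success, 0 on failure.
 */
static int fake_pskb_may_pull(struct fake_skb *skb, size_t need)
{
	unsigned char *p;

	if (need <= skb->len)
		return 1;
	p = realloc(skb->data, need);	/* the data may move here */
	if (!p)
		return 0;
	memset(p + skb->len, 0, need - skb->len);
	skb->data = p;
	skb->len = need;
	return 1;
}

int main(void)
{
	struct fake_skb skb = { .data = calloc(16, 1), .len = 16 };
	unsigned char *hdr = skb.data;		/* pointer taken too early */

	if (!skb.data || !fake_pskb_may_pull(&skb, 4096))
		return 1;

	hdr = skb.data;		/* the fix: re-derive after the pull */
	hdr[0] = 1;
	free(skb.data);
	return 0;
}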
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e35b71289156..88e5ef2c7f51 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1535,6 +1535,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
 	struct sk_buff *nskb;
 	struct sock *sk;
 	struct inet_sock *inet;
+	int err;
 
 	if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
 		return;
@@ -1574,8 +1575,13 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
 	sock_net_set(sk, net);
 	__skb_queue_head_init(&sk->sk_write_queue);
 	sk->sk_sndbuf = sysctl_wmem_default;
-	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
-		       &ipc, &rt, MSG_DONTWAIT);
+	err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
+			     len, 0, &ipc, &rt, MSG_DONTWAIT);
+	if (unlikely(err)) {
+		ip_flush_pending_frames(sk);
+		goto out;
+	}
+
 	nskb = skb_peek(&sk->sk_write_queue);
 	if (nskb) {
 		if (arg->csumoffset >= 0)
@@ -1587,7 +1593,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
 		skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
 		ip_push_pending_frames(sk, &fl4);
 	}
-
+out:
 	put_cpu_var(unicast_sock);
 
 	ip_rt_put(rt);
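The shape of this fix (item 8 in the pull message) is the kernel's standard
goto-unwind: ip_append_data() can fail after queuing partial frames that
each pin the route, so the error path must flush them before the shared
exit drops the dst, instead of leaking it as the old straight-line code
did. A compilable toy of the control flow (every helper here is a
hypothetical stand-in, not the real networking call):

#include <stdio.h>

/* Stand-ins for ip_append_data(), ip_flush_pending_frames(),
 * ip_push_pending_frames() and ip_rt_put(); only the unwind order
 * is the point.
 */
static int  append_data(void)   { return -12; /* pretend -ENOMEM */ }
static void flush_pending(void) { puts("flush partially queued frames"); }
static void push_pending(void)  { puts("transmit queued frames"); }
static void route_put(void)     { puts("release route reference"); }

static int send_reply(void)
{
	int err = append_data();	/* may queue frames that pin the route */

	if (err) {
		flush_pending();	/* must run, or the dst refs leak */
		goto out;
	}
	push_pending();
out:
	route_put();			/* common cleanup on both paths */
	return err;
}

int main(void)
{
	return send_reply() ? 1 : 0;
}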
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index f4c987bb7e94..88c386cf7d85 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -91,11 +91,12 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
 	skb_pull_rcsum(skb, hdr_len);
 
 	if (inner_proto == htons(ETH_P_TEB)) {
-		struct ethhdr *eh = (struct ethhdr *)skb->data;
+		struct ethhdr *eh;
 
 		if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
 			return -ENOMEM;
 
+		eh = (struct ethhdr *)skb->data;
 		if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
 			skb->protocol = eh->h_proto;
 		else
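Same discipline as the fou.c fix above: `eh` is now derived from skb->data
only after pskb_may_pull() has had its chance to reallocate the linear
buffer.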
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index af660030e3c7..32b98d0207b4 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -255,9 +255,9 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
 }
 EXPORT_SYMBOL(cookie_check_timestamp);
 
-struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
-			     struct ip_options *opt)
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 {
+	struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
 	struct tcp_options_received tcp_opt;
 	struct inet_request_sock *ireq;
 	struct tcp_request_sock *treq;
@@ -317,15 +317,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
 	 */
-	if (opt && opt->optlen) {
-		int opt_size = sizeof(struct ip_options_rcu) + opt->optlen;
-
-		ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
-		if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) {
-			kfree(ireq->opt);
-			ireq->opt = NULL;
-		}
-	}
+	ireq->opt = tcp_v4_save_options(skb);
 
 	if (security_inet_conn_request(sk, skb, req)) {
 		reqsk_free(req);
@@ -344,7 +336,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
 			   inet_sk_flowi_flags(sk),
-			   (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
+			   opt->srr ? opt->faddr : ireq->ir_rmt_addr,
 			   ireq->ir_loc_addr, th->source, th->dest);
 	security_req_classify_flow(req, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_key(sock_net(sk), &fl4);
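With this change `opt` always points at the per-skb option block that the
TCP_SKB_CB() rework stores in header.h4.opt, so it can no longer be NULL;
that is why the `opt && ...` guards and the open-coded
kmalloc()/ip_options_echo() sequence can go, replaced by the shared
tcp_v4_save_options() helper (see the tcp_ipv4.c hunk below, which removes
the now-relocated static copy).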
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 00a41499d52c..a12b455928e5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -68,6 +68,7 @@
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/kernel.h>
+#include <linux/prefetch.h>
 #include <net/dst.h>
 #include <net/tcp.h>
 #include <net/inet_common.h>
@@ -3029,6 +3030,21 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 	return packets_acked;
 }
 
+static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
+			   u32 prior_snd_una)
+{
+	const struct skb_shared_info *shinfo;
+
+	/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
+	if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+		return;
+
+	shinfo = skb_shinfo(skb);
+	if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+	    between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1))
+		__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+}
+
 /* Remove acknowledged frames from the retransmission queue. If our packet
  * is before the ack sequence we can discard it as it's confirmed to have
  * arrived at the other end.
@@ -3052,14 +3068,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	first_ackt.v64 = 0;
 
 	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
-		struct skb_shared_info *shinfo = skb_shinfo(skb);
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 		u8 sacked = scb->sacked;
 		u32 acked_pcount;
 
-		if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
-		    between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
-			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+		tcp_ack_tstamp(sk, skb, prior_snd_una);
 
 		/* Determine how many packets and what bytes were acked, tso and else */
 		if (after(scb->end_seq, tp->snd_una)) {
@@ -3073,10 +3086,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 			fully_acked = false;
 		} else {
+			/* Speedup tcp_unlink_write_queue() and next loop */
+			prefetchw(skb->next);
 			acked_pcount = tcp_skb_pcount(skb);
 		}
 
-		if (sacked & TCPCB_RETRANS) {
+		if (unlikely(sacked & TCPCB_RETRANS)) {
 			if (sacked & TCPCB_SACKED_RETRANS)
 				tp->retrans_out -= acked_pcount;
 			flag |= FLAG_RETRANS_DATA_ACKED;
@@ -3107,7 +3122,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			 * connection startup slow start one packet too
 			 * quickly. This is severely frowned upon behavior.
 			 */
-			if (!(scb->tcp_flags & TCPHDR_SYN)) {
+			if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
 				flag |= FLAG_DATA_ACKED;
 			} else {
 				flag |= FLAG_SYN_ACKED;
@@ -3119,9 +3134,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 		tcp_unlink_write_queue(skb, sk);
 		sk_wmem_free_skb(sk, skb);
-		if (skb == tp->retransmit_skb_hint)
+		if (unlikely(skb == tp->retransmit_skb_hint))
 			tp->retransmit_skb_hint = NULL;
-		if (skb == tp->lost_skb_hint)
+		if (unlikely(skb == tp->lost_skb_hint))
 			tp->lost_skb_hint = NULL;
 	}
 
@@ -3132,7 +3147,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		flag |= FLAG_SACK_RENEGING;
 
 	skb_mstamp_get(&now);
-	if (first_ackt.v64) {
+	if (likely(first_ackt.v64)) {
 		seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
 		ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
 	}
@@ -3394,6 +3409,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int acked = 0; /* Number of packets newly acked */
 	long sack_rtt_us = -1L;
 
+	/* We very likely will need to access write queue head. */
+	prefetchw(sk->sk_write_queue.next);
+
 	/* If the ack is older than previous acks
 	 * then we can probably ignore it.
 	 */
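Two themes run through these tcp_input.c changes. The prefetchw() calls
(hence the new linux/prefetch.h include) warm the write-queue head that
both tcp_ack() and the clean-rtx loop are about to modify, and the new
tcp_ack_tstamp() helper tests sk->sk_tsflags, already hot in the socket,
before dereferencing skb_shinfo(), which sits in a different cache line at
the end of the skb's data buffer. The between() test the helper keeps is
the kernel's wrap-safe sequence-number comparison; a minimal standalone
sketch of its semantics (same arithmetic as the kernel's between(),
reimplemented here purely for illustration):

#include <stdio.h>
#include <stdint.h>

/* Wrap-safe "is seq1 within [seq2, seq3]" over 32-bit sequence space;
 * mirrors the one-line between() helper in include/net/tcp.h.
 */
static int between(uint32_t seq1, uint32_t seq2, uint32_t seq3)
{
	return seq3 - seq2 >= seq1 - seq2;	/* unsigned wraparound does the work */
}

int main(void)
{
	/* The window [0xfffffff0, 10] wraps past 2^32 - 1: */
	printf("%d\n", between(5, 0xfffffff0u, 10));	/* 1: inside the window */
	printf("%d\n", between(20, 0xfffffff0u, 10));	/* 0: past the window */
	return 0;
}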
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 552e87e3c269..94d1a7757ff7 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -880,26 +880,6 @@ bool tcp_syn_flood_action(struct sock *sk,
 }
 EXPORT_SYMBOL(tcp_syn_flood_action);
 
-/*
- * Save and compile IPv4 options into the request_sock if needed.
- */
-static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
-{
-	const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
-	struct ip_options_rcu *dopt = NULL;
-
-	if (opt && opt->optlen) {
-		int opt_size = sizeof(*dopt) + opt->optlen;
-
-		dopt = kmalloc(opt_size, GFP_ATOMIC);
-		if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
-			kfree(dopt);
-			dopt = NULL;
-		}
-	}
-	return dopt;
-}
-
 #ifdef CONFIG_TCP_MD5SIG
 /*
  * RFC2385 MD5 checksumming requires a mapping of
@@ -1428,7 +1408,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 
 #ifdef CONFIG_SYN_COOKIES
 	if (!th->syn)
-		sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt);
+		sk = cookie_v4_check(sk, skb);
 #endif
 	return sk;
 }
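tcp_v4_save_options() is not deleted outright: per the "ipv4: share
tcp_v4_save_options() with cookie_v4_check()" entry in the shortlog, the
helper moves out of tcp_ipv4.c (presumably to a shared header as a static
inline) so that syncookies.c can call it too; only the file-local copy is
removed here.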
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index becd98ce9a1c..3af21296d967 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -839,26 +839,38 @@ void tcp_wfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 	struct tcp_sock *tp = tcp_sk(sk);
+	int wmem;
+
+	/* Keep one reference on sk_wmem_alloc.
+	 * Will be released by sk_free() from here or tcp_tasklet_func()
+	 */
+	wmem = atomic_sub_return(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
+	 * Wait until our queues (qdisc + devices) are drained.
+	 * This gives :
+	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
+	 * - chance for incoming ACK (processed by another cpu maybe)
+	 *   to migrate this flow (skb->ooo_okay will be eventually set)
+	 */
+	if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+		goto out;
 
 	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
 	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
 		unsigned long flags;
 		struct tsq_tasklet *tsq;
 
-		/* Keep a ref on socket.
-		 * This last ref will be released in tcp_tasklet_func()
-		 */
-		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
-
 		/* queue this socket to tasklet queue */
 		local_irq_save(flags);
 		tsq = this_cpu_ptr(&tsq_tasklet);
 		list_add(&tp->tsq_node, &tsq->head);
 		tasklet_schedule(&tsq->tasklet);
 		local_irq_restore(flags);
-	} else {
-		sock_wfree(skb);
+		return;
 	}
+out:
+	sk_free(sk);
 }
 
 /* This routine actually transmits TCP packets queued in by
@@ -914,9 +926,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		tcp_ca_event(sk, CA_EVENT_TX_START);
 
 	/* if no packet is in qdisc/device queue, then allow XPS to select
-	 * another queue.
+	 * another queue. We can be called from tcp_tsq_handler()
+	 * which holds one reference to sk_wmem_alloc.
+	 *
+	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
+	 * One way to get this would be to set skb->truesize = 2 on them.
 	 */
-	skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
+	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
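Taken together, the tcp_wfree() rewrite debits skb->truesize - 1 from
sk_wmem_alloc up front and lets the final unit be released by sk_free(),
on every path: tasklet scheduling, the new ksoftirqd deferral, and the
plain fall-through. That held reference is also why tcp_transmit_skb()
relaxes its ooo_okay test from == 0 to < SKB_TRUESIZE(1): as the added
comment notes, tcp_tsq_handler() holds one reference on sk_wmem_alloc, so
the old equality could never be true on that path and the flow could not
migrate to another queue.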