author     Eric Dumazet <edumazet@google.com>        2014-04-20 20:58:17 -0400
committer  David S. Miller <davem@davemloft.net>     2014-04-22 21:27:57 -0400
commit     1f3279ae0c13cd742731726b0ed195d5f09b14e4 (patch)
tree       48cabf2c099db7586abdbf22ac92c1bcb2e1cb89 /net/ipv4/tcp_output.c
parent     6046d5b4e464ba9b2cc8f0407069456624598dd5 (diff)
tcp: avoid retransmits of TCP packets hanging in host queues
In commit 0e280af026a5 ("tcp: introduce TCPSpuriousRtxHostQueues SNMP counter") we added logic to detect when a packet was retransmitted while the prior clone was still sitting in a qdisc or driver queue.

We are now confident we can do better, and catch the problem before we fragment a TSO packet for retransmit, or in the TLP path.

This patch fully exploits that logic by simply canceling the spurious retransmit: the original packet is already in a queue and will eventually leave the host.

This helps avoid network collapses when some event makes RTO estimations very wrong, particularly when dealing with a huge number of sockets sending a synchronized blast.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
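The check relies on the sk_buff fast-clone layout: tcp_transmit_skb() clones every packet before handing it to the IP layer, and a fast clone is carved out of the same allocation as its original, sitting immediately adjacent in memory at skb + 1. While that clone is still alive, i.e. still queued in a qdisc or a driver, its fclone field reads SKB_FCLONE_CLONE, so the original can detect the situation without any extra bookkeeping. The user-space sketch below models just this adjacency trick; the struct, enum, and function names are simplified stand-ins invented for illustration, not the kernel's real sk_buff machinery.

/* Minimal user-space model of the fast-clone adjacency check.
 * All names here (model_skb, fclone_state, still_in_host_queue)
 * are hypothetical stand-ins for the kernel's sk_buff types.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum fclone_state { FCLONE_UNAVAILABLE, FCLONE_ORIG, FCLONE_CLONE };

struct model_skb {
        enum fclone_state fclone;       /* role in a fast-clone pair */
};

/* Fast clones are allocated as an adjacent pair: the original at
 * pair[0], its clone at pair[1]. While the clone is alive (still
 * sitting in a qdisc or driver queue), pair[1].fclone stays
 * FCLONE_CLONE; freeing the clone marks it FCLONE_UNAVAILABLE.
 */
static bool still_in_host_queue(const struct model_skb *skb)
{
        const struct model_skb *fclone = skb + 1;

        return skb->fclone == FCLONE_ORIG &&
               fclone->fclone == FCLONE_CLONE;
}

int main(void)
{
        struct model_skb *pair = calloc(2, sizeof(*pair));

        pair[0].fclone = FCLONE_ORIG;
        pair[1].fclone = FCLONE_CLONE;          /* clone handed to the qdisc */
        printf("in host queue: %d\n", still_in_host_queue(pair));

        pair[1].fclone = FCLONE_UNAVAILABLE;    /* "driver" freed the clone */
        printf("in host queue: %d\n", still_in_host_queue(pair));

        free(pair);
        return 0;
}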
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--    net/ipv4/tcp_output.c    34
1 file changed, 26 insertions(+), 8 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 29dde97c3c41..20847de991ea 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -878,15 +878,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
         BUG_ON(!skb || !tcp_skb_pcount(skb));
 
         if (clone_it) {
-                const struct sk_buff *fclone = skb + 1;
-
                 skb_mstamp_get(&skb->skb_mstamp);
 
-                if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
-                             fclone->fclone == SKB_FCLONE_CLONE))
-                        NET_INC_STATS(sock_net(sk),
-                                      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
-
                 if (unlikely(skb_cloned(skb)))
                         skb = pskb_copy(skb, gfp_mask);
                 else
@@ -2061,6 +2054,25 @@ bool tcp_schedule_loss_probe(struct sock *sk)
         return true;
 }
 
+/* Thanks to skb fast clones, we can detect if a prior transmit of
+ * a packet is still in a qdisc or driver queue.
+ * In this case, there is very little point doing a retransmit !
+ * Note: This is called from BH context only.
+ */
+static bool skb_still_in_host_queue(const struct sock *sk,
+                                    const struct sk_buff *skb)
+{
+        const struct sk_buff *fclone = skb + 1;
+
+        if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
+                     fclone->fclone == SKB_FCLONE_CLONE)) {
+                NET_INC_STATS_BH(sock_net(sk),
+                                 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+                return true;
+        }
+        return false;
+}
+
 /* When probe timeout (PTO) fires, send a new segment if one exists, else
  * retransmit the last segment.
  */
@@ -2086,6 +2098,9 @@ void tcp_send_loss_probe(struct sock *sk)
         if (WARN_ON(!skb))
                 goto rearm_timer;
 
+        if (skb_still_in_host_queue(sk, skb))
+                goto rearm_timer;
+
         pcount = tcp_skb_pcount(skb);
         if (WARN_ON(!pcount))
                 goto rearm_timer;
@@ -2407,6 +2422,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
             min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
                 return -EAGAIN;
 
+        if (skb_still_in_host_queue(sk, skb))
+                return -EBUSY;
+
         if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
                 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
                         BUG();
@@ -2500,7 +2518,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                  * see tcp_input.c tcp_sacktag_write_queue().
                  */
                 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
-        } else {
+        } else if (err != -EBUSY) {
                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
         }
         return err;
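Two details of the diff are worth noting. First, the old inline check in tcp_transmit_skb() could only count the event after the spurious retransmit had already been built, while the new skb_still_in_host_queue() helper runs before TSO fragmentation in __tcp_retransmit_skb() and before the probe transmit in tcp_send_loss_probe(), so the wasted transmit is skipped entirely. Second, a retransmit canceled this way is deliberate rather than a failure: __tcp_retransmit_skb() returns the distinct -EBUSY code so that tcp_retransmit_skb() does not inflate LINUX_MIB_TCPRETRANSFAIL, and the event remains visible through the TCPSpuriousRtxHostQueues counter bumped inside the helper.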