author	Neal Cardwell <ncardwell@google.com>	2014-08-04 19:12:29 -0400
committer	David S. Miller <davem@davemloft.net>	2014-08-05 19:29:33 -0400
commit	5ae344c949e79b8545a11db149f0a85a6e59e1f3 (patch)
tree	62df3bb1b737e263e28f204c90da77188ba907bd /net/ipv4
parent	61675fea33350d2d9b3b5f64b498dc88ee59c695 (diff)
tcp: reduce spurious retransmits due to transient SACK reneging
This commit reduces spurious retransmits due to apparent SACK reneging by only reacting to SACK reneging that persists for a short delay.

When a sequence space hole at snd_una is filled, some TCP receivers send a series of ACKs as they apparently scan their out-of-order queue and cumulatively ACK all the packets that have now been consecutively received. This is essentially misbehavior B in "Misbehaviors in TCP SACK generation" (ACM SIGCOMM Computer Communication Review, April 2011), so we suspect that this is from several common OSes (Windows 2000, Windows Server 2003, Windows XP). However, this issue has also been seen in other cases, e.g. the netdev thread "TCP being hoodwinked into spurious retransmissions by lack of timestamps?" from March 2014, where the receiver was thought to be a BSD box.

Since snd_una would temporarily be adjacent to a previously SACKed range in these scenarios, this receiver behavior triggered the Linux SACK reneging code path in the sender. This led the sender to clear the SACK scoreboard, enter CA_Loss, and spuriously retransmit (potentially) every packet from the entire write queue at line rate just a few milliseconds before the ACK for each packet arrives at the sender.

To avoid such situations, now when a sender sees apparent reneging it does not yet retransmit, but rather adjusts the RTO timer to give the receiver a little time (max(RTT/2, 10ms)) to send us some more ACKs that will restore sanity to the SACK scoreboard. If the reneging persists until this RTO then, as before, we clear the SACK scoreboard and enter CA_Loss.

A 10ms delay tolerates a receiver sending such a stream of ACKs at 56Kbit/sec. And to allow for receivers with slower or more congested paths, we wait for at least RTT/2.

We validated the resulting max(RTT/2, 10ms) delay formula with a mix of North American and South American Google web server traffic, and found that for ACKs displaying transient reneging:

(1) 90% of inter-ACK delays were less than 10ms
(2) 99% of inter-ACK delays were less than RTT/2

In tests on Google web servers this commit reduced reneging events by 75%-90% (as measured by the TcpExtTCPSACKReneging counter), without any measurable impact on latency for user HTTP and SPDY requests.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
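For concreteness, here is a minimal userspace C sketch of the max(RTT/2, 10ms) computation described above. It relies on the kernel convention that tp->srtt_us stores the smoothed RTT in microseconds left-shifted by 3 (i.e. 8x the RTT, so srtt_us >> 4 is RTT/2); reneg_delay_us() is an illustrative name, not a kernel API:

#include <stdio.h>

/* Grace delay before acting on apparent SACK reneging, in usec:
 * max(RTT/2, 10ms). srtt_us follows the kernel's encoding of
 * 8 * smoothed-RTT-in-usec, so (srtt_us >> 4) == RTT/2.
 */
static unsigned long reneg_delay_us(unsigned long srtt_us)
{
	unsigned long half_rtt_us = srtt_us >> 4;	/* RTT/2 */
	unsigned long floor_us = 10 * 1000;		/* 10 ms floor */

	return half_rtt_us > floor_us ? half_rtt_us : floor_us;
}

int main(void)
{
	/* RTT = 100 ms: srtt_us = 8 * 100000, delay = 50000 us (RTT/2 wins). */
	printf("%lu\n", reneg_delay_us(8 * 100000UL));
	/* RTT = 4 ms: srtt_us = 8 * 4000, delay = 10000 us (floor wins). */
	printf("%lu\n", reneg_delay_us(8 * 4000UL));
	return 0;
}

The 10ms floor is consistent with the 56Kbit/sec figure: a bare 40-byte ACK is 320 bits, roughly 5.7ms of serialization time at 56Kbit/sec, so consecutive ACKs from such a receiver arrive within the floor.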
Diffstat (limited to 'net/ipv4')
-rw-r--r--	net/ipv4/tcp_input.c	29
-rw-r--r--	net/ipv4/tcp_timer.c	4
2 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7832d941dbcd..6a2984507755 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1904,16 +1904,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
 	tp->sacked_out = 0;
 }
 
-/* Enter Loss state. If "how" is not zero, forget all SACK information
+/* Enter Loss state. If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
  */
-void tcp_enter_loss(struct sock *sk, int how)
+void tcp_enter_loss(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	bool new_recovery = false;
+	bool is_reneg;			/* is receiver reneging on SACKs? */
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1934,7 +1935,11 @@ void tcp_enter_loss(struct sock *sk, int how)
 		tcp_reset_reno_sack(tp);
 
 	tp->undo_marker = tp->snd_una;
-	if (how) {
+
+	skb = tcp_write_queue_head(sk);
+	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+	if (is_reneg) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
 		tp->sacked_out = 0;
 		tp->fackets_out = 0;
 	}
@@ -1948,7 +1953,7 @@ void tcp_enter_loss(struct sock *sk, int how)
 		tp->undo_marker = 0;
 
 		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
@@ -1981,19 +1986,21 @@ void tcp_enter_loss(struct sock *sk, int how)
  * remembered SACKs do not reflect real state of receiver i.e.
  * receiver _host_ is heavily congested (or buggy).
  *
- * Do processing similar to RTO timeout.
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
  */
 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
 {
 	if (flag & FLAG_SACK_RENEGING) {
-		struct inet_connection_sock *icsk = inet_csk(sk);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+		struct tcp_sock *tp = tcp_sk(sk);
+		unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+					  msecs_to_jiffies(10));
 
-		tcp_enter_loss(sk, 1);
-		icsk->icsk_retransmits++;
-		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  icsk->icsk_rto, TCP_RTO_MAX);
+					  delay, TCP_RTO_MAX);
 		return true;
 	}
 	return false;
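To see the new detection in the first hunk in isolation: snd_una only advances over cumulatively ACKed data, so the segment at the head of the write queue has, by definition, not been cumulatively ACKed. If that segment is nonetheless tagged as SACKed, the receiver reported holding it once and has since stopped acknowledging it. A simplified standalone sketch of that predicate follows, with the struct and flag bit mocked for illustration (the real definitions live in include/net/tcp.h):

#include <stdbool.h>
#include <stddef.h>

#define TCPCB_SACKED_ACKED 0x01	/* mock flag bit, for illustration only */

struct mock_segment {
	unsigned char sacked;	/* per-segment SACK state bits */
};

/* True when the oldest unacknowledged segment was previously SACKed:
 * the receiver reported holding this data yet is not cumulatively
 * ACKing it, which is the reneging signature tcp_enter_loss() checks.
 */
static bool sack_reneging(const struct mock_segment *write_queue_head)
{
	return write_queue_head &&
	       (write_queue_head->sacked & TCPCB_SACKED_ACKED);
}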
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 286227abed10..df90cd1ce37f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -391,7 +391,7 @@ void tcp_retransmit_timer(struct sock *sk)
 		tcp_write_err(sk);
 		goto out;
 	}
-	tcp_enter_loss(sk, 0);
+	tcp_enter_loss(sk);
 	tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
 	__sk_dst_reset(sk);
 	goto out_reset_timer;
@@ -422,7 +422,7 @@ void tcp_retransmit_timer(struct sock *sk)
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
 	}
 
-	tcp_enter_loss(sk, 0);
+	tcp_enter_loss(sk);
 
 	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
 		/* Retransmission failed because of local congestion,