author		Yuchung Cheng <ycheng@google.com>	2015-10-17 00:57:47 -0400
committer	David S. Miller <davem@davemloft.net>	2015-10-21 10:00:53 -0400
commit		4f41b1c58a32537542f14c1150099131613a5e8a (patch)
tree		86f37587abdfeee11ad36c75dcae37adf8aa091f
parent		659a8ad56f490279f0efee43a62ffa1ac914a4e0 (diff)
tcp: use RACK to detect losses
This patch implements the second half of RACK, which uses the most recent transmit time among all delivered packets to detect losses.

tcp_rack_mark_lost() is called upon receiving a dubious ACK. It checks whether a not-yet-sacked packet was sent at least "reo_wnd" before the sent time of the most recently delivered packet. If so, the packet is deemed lost.

The "reo_wnd" reordering window starts at 1 msec for fast loss detection and changes to min-RTT/4 when reordering is observed. We found that 1 msec accommodates small degrees of reordering (<3 pkts) well on faster links. We use min-RTT instead of SRTT because reordering is more of a path property, while SRTT can be inflated by self-inflicted congestion. The factor of 4 is borrowed from delayed early retransmit and seems to work reasonably well.

Since RACK is still experimental, it is used as a supplemental loss detection mechanism on top of the existing algorithms. It is only effective after fast recovery starts or after a timeout occurs. Fast recovery is still triggered by FACK and/or the dupack threshold rather than by RACK.

We introduce a new sysctl, net.ipv4.tcp_recovery, for future loss recovery experiments. For now, RACK can be disabled by setting it to 0.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
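For orientation before the diff, here is a minimal, self-contained C sketch of the RACK decision rule described above. This is a simplified userspace model, not the kernel code; the struct and helper names are illustrative, and the real implementation is in net/ipv4/tcp_recovery.c below.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative model only: all timestamps are in microseconds. */
struct rack_state {
	uint64_t mstamp;	/* xmit time of the most recently delivered packet */
	bool     reord;		/* has reordering been observed on this flow? */
};

/* Pick the "settling delay": 1 msec by default, min-RTT/4 once
 * reordering has been seen (mirrors the patch's reo_wnd choice).
 */
static uint64_t reo_wnd_us(const struct rack_state *rack, uint64_t min_rtt_us)
{
	uint64_t reo_wnd = 1000;

	if (rack->reord && min_rtt_us != UINT64_MAX && min_rtt_us / 4 > reo_wnd)
		reo_wnd = min_rtt_us / 4;
	return reo_wnd;
}

/* A not-yet-sacked packet is deemed lost if a packet sent at least
 * reo_wnd later has already been delivered.
 */
static bool rack_packet_lost(const struct rack_state *rack,
			     uint64_t pkt_xmit_us, uint64_t min_rtt_us)
{
	return rack->mstamp > pkt_xmit_us &&
	       rack->mstamp - pkt_xmit_us > reo_wnd_us(rack, min_rtt_us);
}

int main(void)
{
	struct rack_state rack = { .mstamp = 105000, .reord = false };

	/* Packet sent at t=100 ms; a packet sent at t=105 ms was already
	 * delivered, so with the 1 msec window the older packet is lost.
	 */
	printf("lost: %d\n", rack_packet_lost(&rack, 100000, 20000));
	return 0;
}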
-rw-r--r--	Documentation/networking/ip-sysctl.txt	9
-rw-r--r--	include/net/tcp.h			9
-rw-r--r--	net/ipv4/sysctl_net_ipv4.c		7
-rw-r--r--	net/ipv4/tcp_input.c			9
-rw-r--r--	net/ipv4/tcp_recovery.c			77
5 files changed, 109 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 502d6a572b4f..85752c81c5ec 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -433,6 +433,15 @@ tcp_orphan_retries - INTEGER
 	you should think about lowering this value, such sockets
 	may consume significant resources. Cf. tcp_max_orphans.
 
+tcp_recovery - INTEGER
+	This value is a bitmap to enable various experimental loss recovery
+	features.
+
+	RACK: 0x1 enables the RACK loss detection for fast detection of lost
+	      retransmissions and tail drops.
+
+	Default: 0x1
+
 tcp_reordering - INTEGER
 	Initial reordering level of packets in a TCP stream.
 	TCP stack can then dynamically adjust flow reordering level
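As a usage note (not part of the patch): net.ipv4.tcp_recovery is exposed at /proc/sys/net/ipv4/tcp_recovery, so RACK can be disabled at runtime by writing 0 there (root privileges required). A minimal C sketch:

#include <stdio.h>

int main(void)
{
	/* Writing 0 clears all recovery bits and disables RACK;
	 * writing 1 (0x1) restores the default.
	 */
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_recovery", "w");

	if (!f) {
		perror("tcp_recovery");
		return 1;
	}
	fputs("0\n", f);
	fclose(f);
	return 0;
}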
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c3a9fe057d3..11e320412216 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -567,6 +567,7 @@ void tcp_resume_early_retransmit(struct sock *sk);
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
 
 /* tcp_timer.c */
 void tcp_init_xmit_timers(struct sock *);
@@ -1752,6 +1753,14 @@ void tcp_init(void);
 
 /* tcp_recovery.c */
 
+/* Flags to enable various loss recovery features. See below */
+extern int sysctl_tcp_recovery;
+
+/* Use TCP RACK to detect (some) tail and retransmit losses */
+#define TCP_RACK_LOST_RETRANS  0x1
+
+extern int tcp_rack_mark_lost(struct sock *sk);
+
 extern void tcp_rack_advance(struct tcp_sock *tp,
 			     const struct skb_mstamp *xmit_time, u8 sacked);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 13ab434c2909..25300c5e283b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname	= "tcp_recovery",
+		.data		= &sysctl_tcp_recovery,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "tcp_reordering",
 		.data		= &sysctl_tcp_reordering,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ce8370525832..fdd88c3803a6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -881,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 
 	if (metric > 0)
 		tcp_disable_early_retrans(tp);
+	tp->rack.reord = 1;
 }
 
 /* This must be called before lost_out is incremented */
@@ -906,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 	}
 }
 
-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
-					    struct sk_buff *skb)
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	tcp_verify_retransmit_hint(tp, skb);
 
@@ -2806,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		}
 	}
 
+	/* Use RACK to detect loss */
+	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
+	    tcp_rack_mark_lost(sk))
+		flag |= FLAG_LOST_RETRANS;
+
 	/* E. Process state. */
 	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 8f66a6584845..5353085fd0b2 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,6 +1,83 @@
 #include <linux/tcp.h>
 #include <net/tcp.h>
 
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+
+/* Marks a packet lost, if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ * dupthresh: 3 OOO packets delivered (packet count)
+ * FACK: sequence delta to highest sacked sequence (sequence space)
+ * RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is it applies to both original and retransmitted
+ * packet and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * The current version is only used after recovery starts but can be
+ * easily extended to detect the first loss.
+ */
+int tcp_rack_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	u32 reo_wnd, prior_retrans = tp->retrans_out;
+
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+		return 0;
+
+	/* Reset the advanced flag to avoid unnecessary queue scanning */
+	tp->rack.advanced = 0;
+
+	/* To be more reordering resilient, allow min_rtt/4 settling delay
+	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
+	 * RTT because reordering is often a path property and less related
+	 * to queuing or delayed ACKs.
+	 *
+	 * TODO: measure and adapt to the observed reordering delay, and
+	 * use a timer to retransmit like the delayed early retransmit.
+	 */
+	reo_wnd = 1000;
+	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+
+	tcp_for_write_queue(skb, sk) {
+		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+		if (skb == tcp_send_head(sk))
+			break;
+
+		/* Skip ones already (s)acked */
+		if (!after(scb->end_seq, tp->snd_una) ||
+		    scb->sacked & TCPCB_SACKED_ACKED)
+			continue;
+
+		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+
+			if (skb_mstamp_us_delta(&tp->rack.mstamp,
+						&skb->skb_mstamp) <= reo_wnd)
+				continue;
+
+			/* skb is lost if packet sent later is sacked */
+			tcp_skb_mark_lost_uncond_verify(tp, skb);
+			if (scb->sacked & TCPCB_SACKED_RETRANS) {
+				scb->sacked &= ~TCPCB_SACKED_RETRANS;
+				tp->retrans_out -= tcp_skb_pcount(skb);
+				NET_INC_STATS_BH(sock_net(sk),
+						 LINUX_MIB_TCPLOSTRETRANSMIT);
+			}
+		} else if (!(scb->sacked & TCPCB_RETRANS)) {
+			/* Original data are sent sequentially so stop early
+			 * b/c the rest are all sent after rack_sent
+			 */
+			break;
+		}
+	}
+	return prior_retrans - tp->retrans_out;
+}
+
 /* Record the most recently (re)sent time among the (s)acked packets */
 void tcp_rack_advance(struct tcp_sock *tp,
 		      const struct skb_mstamp *xmit_time, u8 sacked)
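For intuition about the settling delay (an illustrative calculation, not part of the patch): without observed reordering, reo_wnd stays at the 1000 us floor. Once tp->rack.reord has been set and, say, the measured minimum RTT is 20 ms, reo_wnd = max(20000 >> 2, 1000) = 5000 us, so a packet is marked lost only if a packet sent at least 5 ms after it has already been delivered.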