path: root/net/ipv4/tcp_output.c
author	Nandita Dukkipati <nanditad@google.com>	2013-03-11 06:00:43 -0400
committer	David S. Miller <davem@davemloft.net>	2013-03-12 08:30:34 -0400
commit	6ba8a3b19e764b6a65e4030ab0999be50c291e6c
tree	57ba4b6411762d1124a3e08577e32e86769c024f /net/ipv4/tcp_output.c
parent	83e519b63480e691d43ee106547b10941bfa0232
tcp: Tail loss probe (TLP)
This patch series implements the Tail loss probe (TLP) algorithm described
in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The
first patch implements the basic algorithm.

TLP's goal is to reduce tail latency of short transactions. It achieves
this by converting retransmission timeouts (RTOs) occurring due to tail
losses (losses at the end of transactions) into fast recovery. TLP
transmits one packet in two round-trips when a connection is in Open state
and isn't receiving any ACKs. The transmitted packet, aka loss probe, can
be either new or a retransmission. When there is tail loss, the ACK from
a loss probe triggers FACK/early-retransmit based fast recovery, thus
avoiding a costly RTO. In the absence of loss, there is no change in the
connection state.

PTO stands for probe timeout. It is a timer event indicating that an ACK
is overdue and triggers a loss probe packet. The PTO value is set to
max(2*SRTT, 10ms) and is adjusted to account for the delayed ACK timer when
there is only one outstanding packet.

TLP Algorithm

On transmission of new data in Open state:
  -> packets_out > 1: schedule PTO in max(2*SRTT, 10ms).
  -> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms).
  -> PTO = min(PTO, RTO).

Conditions for scheduling PTO:
  -> Connection is in Open state.
  -> Connection is either cwnd limited or has no new data to send.
  -> Number of probes per tail loss episode is limited to one.
  -> Connection is SACK enabled.

When PTO fires:
  new_segment_exists:
    -> transmit new segment.
    -> packets_out++. cwnd remains the same.
  no_new_packet:
    -> retransmit the last segment. Its ACK triggers FACK or
       early-retransmit based recovery.

ACK path:
  -> rearm RTO at start of ACK processing.
  -> reschedule PTO if need be.

In addition, the patch includes a small variation to the Early Retransmit
(ER) algorithm, such that ER and TLP together can in principle recover any
N-degree of tail loss through fast recovery. TLP is controlled by the same
sysctl as ER, the tcp_early_retrans sysctl:

tcp_early_retrans==0; disables TLP and ER.
                 ==1; enables RFC5827 ER.
                 ==2; delayed ER.
                 ==3; TLP and delayed ER. [DEFAULT]
                 ==4; TLP only.

The TLP patch series has been extensively tested on Google Web servers.
It is most effective for short Web transactions, where it reduced RTOs by
15% and improved HTTP response time (average by 6%, 99th percentile by
10%). The transmitted probes account for <0.5% of the overall
transmissions.

Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
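For illustration only, here is a minimal user-space sketch of the PTO selection
described above. compute_pto(), the millisecond units, and the constants are
hypothetical stand-ins for this note, not part of the patch (the kernel code in
the diff below works in jiffies on tp->srtt and TCP_DELACK_MAX):

/* Hypothetical sketch of PTO = max(2*SRTT, 10ms), with a delayed-ACK
 * allowance when only one packet is outstanding, capped by the RTO.
 */
#include <stdio.h>

#define DELACK_MAX_MS 200   /* assumed delayed-ACK timer, mirrors 200ms */
#define PTO_FLOOR_MS  10    /* 10ms minimum from the commit message */

static unsigned int max_u(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

static unsigned int compute_pto(unsigned int srtt_ms,
				unsigned int packets_out,
				unsigned int rto_ms)
{
	unsigned int pto = 2 * srtt_ms;

	if (packets_out == 1)
		pto = max_u(pto, srtt_ms + srtt_ms / 2 + DELACK_MAX_MS);
	pto = max_u(pto, PTO_FLOOR_MS);
	return pto < rto_ms ? pto : rto_ms;   /* PTO = min(PTO, RTO) */
}

int main(void)
{
	/* SRTT = 50ms, one packet in flight, RTO = 300ms -> PTO = 275ms */
	printf("PTO = %u ms\n", compute_pto(50, 1, 300));
	return 0;
}

With SRTT = 50ms and a single outstanding packet, the sketch yields
1.5*RTT + 200ms = 275ms, which is below the 300ms RTO and so is used as the PTO.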
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	128
1 file changed, 124 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e2b4461074da..beb63dbc85f5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -74,6 +74,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int prior_packets = tp->packets_out;
 
@@ -85,7 +86,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 	tp->frto_counter = 3;
 
 	tp->packets_out += tcp_skb_pcount(skb);
-	if (!prior_packets || tp->early_retrans_delayed)
+	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
 }
 
@@ -1959,6 +1961,9 @@ static int tcp_mtu_probe(struct sock *sk)
  * snd_up-64k-mss .. snd_up cannot be large. However, taking into
  * account rare use of URG, this is not a big flaw.
  *
+ * Send at most one packet when push_one > 0. Temporarily ignore
+ * cwnd limit to force at most one packet out when push_one == 2.
+
  * Returns true, if no segments are in flight and we have queued segments,
  * but cannot send anything now because of SWS or another problem.
  */
@@ -1994,8 +1999,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			goto repair; /* Skip network transmission */
 
 		cwnd_quota = tcp_cwnd_test(tp, skb);
-		if (!cwnd_quota)
-			break;
+		if (!cwnd_quota) {
+			if (push_one == 2)
+				/* Force out a loss probe pkt. */
+				cwnd_quota = 1;
+			else
+				break;
+		}
 
 		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
 			break;
@@ -2049,10 +2059,120 @@ repair:
 	if (likely(sent_pkts)) {
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += sent_pkts;
+
+		/* Send one loss probe per tail loss episode. */
+		if (push_one != 2)
+			tcp_schedule_loss_probe(sk);
 		tcp_cwnd_validate(sk);
 		return false;
 	}
-	return !tp->packets_out && tcp_send_head(sk);
+	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
+}
+
+bool tcp_schedule_loss_probe(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 timeout, tlp_time_stamp, rto_time_stamp;
+	u32 rtt = tp->srtt >> 3;
+
+	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
+		return false;
+	/* No consecutive loss probes. */
+	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
+		tcp_rearm_rto(sk);
+		return false;
+	}
+	/* Don't do any loss probe on a Fast Open connection before 3WHS
+	 * finishes.
+	 */
+	if (sk->sk_state == TCP_SYN_RECV)
+		return false;
+
+	/* TLP is only scheduled when next timer event is RTO. */
+	if (icsk->icsk_pending != ICSK_TIME_RETRANS)
+		return false;
+
+	/* Schedule a loss probe in 2*RTT for SACK capable connections
+	 * in Open state, that are either limited by cwnd or application.
+	 */
+	if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
+	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+		return false;
+
+	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
+	     tcp_send_head(sk))
+		return false;
+
+	/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+	 * for delayed ack when there's one outstanding packet.
+	 */
+	timeout = rtt << 1;
+	if (tp->packets_out == 1)
+		timeout = max_t(u32, timeout,
+				(rtt + (rtt >> 1) + TCP_DELACK_MAX));
+	timeout = max_t(u32, timeout, msecs_to_jiffies(10));
+
+	/* If RTO is shorter, just schedule TLP in its place. */
+	tlp_time_stamp = tcp_time_stamp + timeout;
+	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
+	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
+		s32 delta = rto_time_stamp - tcp_time_stamp;
+		if (delta > 0)
+			timeout = delta;
+	}
+
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+				  TCP_RTO_MAX);
+	return true;
+}
+
+/* When probe timeout (PTO) fires, send a new segment if one exists, else
+ * retransmit the last segment.
+ */
+void tcp_send_loss_probe(struct sock *sk)
+{
+	struct sk_buff *skb;
+	int pcount;
+	int mss = tcp_current_mss(sk);
+	int err = -1;
+
+	if (tcp_send_head(sk) != NULL) {
+		err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+		goto rearm_timer;
+	}
+
+	/* Retransmit last segment. */
+	skb = tcp_write_queue_tail(sk);
+	if (WARN_ON(!skb))
+		goto rearm_timer;
+
+	pcount = tcp_skb_pcount(skb);
+	if (WARN_ON(!pcount))
+		goto rearm_timer;
+
+	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+			goto rearm_timer;
+		skb = tcp_write_queue_tail(sk);
+	}
+
+	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
+		goto rearm_timer;
+
+	/* Probe with zero data doesn't trigger fast recovery. */
+	if (skb->len > 0)
+		err = __tcp_retransmit_skb(sk, skb);
+
+rearm_timer:
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  inet_csk(sk)->icsk_rto,
+				  TCP_RTO_MAX);
+
+	if (likely(!err))
+		NET_INC_STATS_BH(sock_net(sk),
+				 LINUX_MIB_TCPLOSSPROBES);
+	return;
 }
 
 /* Push out any pending frames which were held back due to