path: root/net/ipv4/tcp_output.c
author	Nandita Dukkipati <nanditad@google.com>	2013-03-11 06:00:43 -0400
committer	David S. Miller <davem@davemloft.net>	2013-03-12 08:30:34 -0400
commit	6ba8a3b19e764b6a65e4030ab0999be50c291e6c
tree	57ba4b6411762d1124a3e08577e32e86769c024f /net/ipv4/tcp_output.c
parent	83e519b63480e691d43ee106547b10941bfa0232
tcp: Tail loss probe (TLP)
This patch series implements the Tail loss probe (TLP) algorithm described
in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The
first patch implements the basic algorithm.

TLP's goal is to reduce tail latency of short transactions. It achieves
this by converting retransmission timeouts (RTOs) occurring due to tail
losses (losses at the end of transactions) into fast recovery. TLP
transmits one packet in two round-trips when a connection is in Open state
and isn't receiving any ACKs. The transmitted packet, aka loss probe, can
be either new or a retransmission. When there is tail loss, the ACK from
a loss probe triggers FACK/early-retransmit based fast recovery, thus
avoiding a costly RTO. In the absence of loss, there is no change in the
connection state.

PTO stands for probe timeout. It is a timer event indicating that an ACK
is overdue and triggers a loss probe packet. The PTO value is set to
max(2*SRTT, 10ms) and is adjusted to account for the delayed ACK timer when
there is only one outstanding packet.

TLP Algorithm

On transmission of new data in Open state:
  -> packets_out > 1: schedule PTO in max(2*SRTT, 10ms).
  -> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms).
  -> PTO = min(PTO, RTO).

Conditions for scheduling PTO:
  -> Connection is in Open state.
  -> Connection is either cwnd limited or has no new data to send.
  -> Number of probes per tail loss episode is limited to one.
  -> Connection is SACK enabled.

When PTO fires:
  new_segment_exists:
    -> transmit new segment.
    -> packets_out++. cwnd remains the same.
  no_new_packet:
    -> retransmit the last segment. Its ACK triggers FACK or
       early-retransmit based recovery.

ACK path:
  -> rearm RTO at start of ACK processing.
  -> reschedule PTO if need be.

In addition, the patch includes a small variation to the Early Retransmit
(ER) algorithm, such that ER and TLP together can in principle recover any
N-degree of tail loss through fast recovery. TLP is controlled by the same
sysctl as ER, the tcp_early_retrans sysctl:

tcp_early_retrans==0; disables TLP and ER.
                 ==1; enables RFC5827 ER.
                 ==2; delayed ER.
                 ==3; TLP and delayed ER. [DEFAULT]
                 ==4; TLP only.

The TLP patch series has been extensively tested on Google Web servers.
It is most effective for short Web transactions, where it reduced RTOs by
15% and improved HTTP response time (average by 6%, 99th percentile by
10%). The transmitted probes account for <0.5% of the overall
transmissions.

Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
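For illustration only, here is a minimal user-space sketch of the PTO selection
described above. compute_pto(), the millisecond units, and the constants are
hypothetical stand-ins for this note, not part of the patch (the kernel code in
the diff below works in jiffies on tp->srtt and TCP_DELACK_MAX):

/* Hypothetical sketch of PTO = max(2*SRTT, 10ms), with a delayed-ACK
 * allowance when only one packet is outstanding, capped by the RTO.
 */
#include <stdio.h>

#define DELACK_MAX_MS 200   /* assumed delayed-ACK timer, mirrors 200ms */
#define PTO_FLOOR_MS  10    /* 10ms minimum from the commit message */

static unsigned int max_u(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

static unsigned int compute_pto(unsigned int srtt_ms,
				unsigned int packets_out,
				unsigned int rto_ms)
{
	unsigned int pto = 2 * srtt_ms;

	if (packets_out == 1)
		pto = max_u(pto, srtt_ms + srtt_ms / 2 + DELACK_MAX_MS);
	pto = max_u(pto, PTO_FLOOR_MS);
	return pto < rto_ms ? pto : rto_ms;   /* PTO = min(PTO, RTO) */
}

int main(void)
{
	/* SRTT = 50ms, one packet in flight, RTO = 300ms -> PTO = 275ms */
	printf("PTO = %u ms\n", compute_pto(50, 1, 300));
	return 0;
}

With SRTT = 50ms and a single outstanding packet, the sketch yields
1.5*RTT + 200ms = 275ms, which is below the 300ms RTO and so is used as the PTO.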
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	128
1 file changed, 124 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e2b4461074da..beb63dbc85f5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -74,6 +74,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int prior_packets = tp->packets_out;
 
@@ -85,7 +86,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 	tp->frto_counter = 3;
 
 	tp->packets_out += tcp_skb_pcount(skb);
-	if (!prior_packets || tp->early_retrans_delayed)
+	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
 }
 
@@ -1959,6 +1961,9 @@ static int tcp_mtu_probe(struct sock *sk)
  * snd_up-64k-mss .. snd_up cannot be large. However, taking into
  * account rare use of URG, this is not a big flaw.
  *
+ * Send at most one packet when push_one > 0. Temporarily ignore
+ * cwnd limit to force at most one packet out when push_one == 2.
+
  * Returns true, if no segments are in flight and we have queued segments,
  * but cannot send anything now because of SWS or another problem.
  */
@@ -1994,8 +1999,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			goto repair; /* Skip network transmission */
 
 		cwnd_quota = tcp_cwnd_test(tp, skb);
-		if (!cwnd_quota)
-			break;
+		if (!cwnd_quota) {
+			if (push_one == 2)
+				/* Force out a loss probe pkt. */
+				cwnd_quota = 1;
+			else
+				break;
+		}
 
 		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
 			break;
@@ -2049,10 +2059,120 @@ repair:
 	if (likely(sent_pkts)) {
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += sent_pkts;
+
+		/* Send one loss probe per tail loss episode. */
+		if (push_one != 2)
+			tcp_schedule_loss_probe(sk);
 		tcp_cwnd_validate(sk);
 		return false;
 	}
-	return !tp->packets_out && tcp_send_head(sk);
+	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
+}
+
+bool tcp_schedule_loss_probe(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 timeout, tlp_time_stamp, rto_time_stamp;
+	u32 rtt = tp->srtt >> 3;
+
+	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
+		return false;
+	/* No consecutive loss probes. */
+	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
+		tcp_rearm_rto(sk);
+		return false;
+	}
+	/* Don't do any loss probe on a Fast Open connection before 3WHS
+	 * finishes.
+	 */
+	if (sk->sk_state == TCP_SYN_RECV)
+		return false;
+
+	/* TLP is only scheduled when next timer event is RTO. */
+	if (icsk->icsk_pending != ICSK_TIME_RETRANS)
+		return false;
+
+	/* Schedule a loss probe in 2*RTT for SACK capable connections
+	 * in Open state, that are either limited by cwnd or application.
+	 */
+	if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
+	    !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+		return false;
+
+	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
+	     tcp_send_head(sk))
+		return false;
+
+	/* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+	 * for delayed ack when there's one outstanding packet.
+	 */
+	timeout = rtt << 1;
+	if (tp->packets_out == 1)
+		timeout = max_t(u32, timeout,
+				(rtt + (rtt >> 1) + TCP_DELACK_MAX));
+	timeout = max_t(u32, timeout, msecs_to_jiffies(10));
+
+	/* If RTO is shorter, just schedule TLP in its place. */
+	tlp_time_stamp = tcp_time_stamp + timeout;
+	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
+	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
+		s32 delta = rto_time_stamp - tcp_time_stamp;
+		if (delta > 0)
+			timeout = delta;
+	}
+
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+				  TCP_RTO_MAX);
+	return true;
+}
+
+/* When probe timeout (PTO) fires, send a new segment if one exists, else
+ * retransmit the last segment.
+ */
+void tcp_send_loss_probe(struct sock *sk)
+{
+	struct sk_buff *skb;
+	int pcount;
+	int mss = tcp_current_mss(sk);
+	int err = -1;
+
+	if (tcp_send_head(sk) != NULL) {
+		err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+		goto rearm_timer;
+	}
+
+	/* Retransmit last segment. */
+	skb = tcp_write_queue_tail(sk);
+	if (WARN_ON(!skb))
+		goto rearm_timer;
+
+	pcount = tcp_skb_pcount(skb);
+	if (WARN_ON(!pcount))
+		goto rearm_timer;
+
+	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+			goto rearm_timer;
+		skb = tcp_write_queue_tail(sk);
+	}
+
+	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
+		goto rearm_timer;
+
+	/* Probe with zero data doesn't trigger fast recovery. */
+	if (skb->len > 0)
+		err = __tcp_retransmit_skb(sk, skb);
+
+rearm_timer:
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+				  inet_csk(sk)->icsk_rto,
+				  TCP_RTO_MAX);
+
+	if (likely(!err))
+		NET_INC_STATS_BH(sock_net(sk),
+				 LINUX_MIB_TCPLOSSPROBES);
+	return;
 }
 
 /* Push out any pending frames which were held back due to