aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
authorNandita Dukkipati <nanditad@google.com>2013-03-11 06:00:43 -0400
committerDavid S. Miller <davem@davemloft.net>2013-03-12 08:30:34 -0400
commit6ba8a3b19e764b6a65e4030ab0999be50c291e6c (patch)
tree57ba4b6411762d1124a3e08577e32e86769c024f /net/ipv4/tcp_input.c
parent83e519b63480e691d43ee106547b10941bfa0232 (diff)
tcp: Tail loss probe (TLP)
This patch series implements the Tail loss probe (TLP) algorithm described in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The first patch implements the basic algorithm. TLP's goal is to reduce tail latency of short transactions. It achieves this by converting retransmission timeouts (RTOs) occurring due to tail losses (losses at end of transactions) into fast recovery. TLP transmits one packet in two round-trips when a connection is in Open state and isn't receiving any ACKs. The transmitted packet, aka loss probe, can be either new or a retransmission. When there is tail loss, the ACK from a loss probe triggers FACK/early-retransmit based fast recovery, thus avoiding a costly RTO. In the absence of loss, there is no change in the connection state. PTO stands for probe timeout. It is a timer event indicating that an ACK is overdue and triggers a loss probe packet. The PTO value is set to max(2*SRTT, 10ms) and is adjusted to account for delayed ACK timer when there is only one outstanding packet. TLP Algorithm On transmission of new data in Open state: -> packets_out > 1: schedule PTO in max(2*SRTT, 10ms). -> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms) -> PTO = min(PTO, RTO) Conditions for scheduling PTO: -> Connection is in Open state. -> Connection is either cwnd limited or no new data to send. -> Number of probes per tail loss episode is limited to one. -> Connection is SACK enabled. When PTO fires: new_segment_exists: -> transmit new segment. -> packets_out++. cwnd remains same. no_new_packet: -> retransmit the last segment. Its ACK triggers FACK or early retransmit based recovery. ACK path: -> rearm RTO at start of ACK processing. -> reschedule PTO if need be. In addition, the patch includes a small variation to the Early Retransmit (ER) algorithm, such that ER and TLP together can in principle recover any N-degree of tail loss through fast recovery. TLP is controlled by the same sysctl as ER, tcp_early_retrans sysctl. 
tcp_early_retrans==0; disables TLP and ER. ==1; enables RFC5827 ER. ==2; delayed ER. ==3; TLP and delayed ER. [DEFAULT] ==4; TLP only. The TLP patch series have been extensively tested on Google Web servers. It is most effective for short Web transactions, where it reduced RTOs by 15% and improved HTTP response time (average by 6%, 99th percentile by 10%). The transmitted probes account for <0.5% of the overall transmissions. Signed-off-by: Nandita Dukkipati <nanditad@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Acked-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c24
1 files changed, 15 insertions, 9 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0d9bdacce99f..b794f89ac1f2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -98,7 +98,7 @@ int sysctl_tcp_frto_response __read_mostly;
98int sysctl_tcp_thin_dupack __read_mostly; 98int sysctl_tcp_thin_dupack __read_mostly;
99 99
100int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 100int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
101int sysctl_tcp_early_retrans __read_mostly = 2; 101int sysctl_tcp_early_retrans __read_mostly = 3;
102 102
103#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 103#define FLAG_DATA 0x01 /* Incoming frame contained data. */
104#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 104#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -2150,15 +2150,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
2150 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples 2150 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
2151 * available, or RTO is scheduled to fire first. 2151 * available, or RTO is scheduled to fire first.
2152 */ 2152 */
2153 if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) 2153 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
2154 (flag & FLAG_ECE) || !tp->srtt)
2154 return false; 2155 return false;
2155 2156
2156 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); 2157 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
2157 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) 2158 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2158 return false; 2159 return false;
2159 2160
2160 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); 2161 inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
2161 tp->early_retrans_delayed = 1; 2162 TCP_RTO_MAX);
2162 return true; 2163 return true;
2163} 2164}
2164 2165
@@ -2321,7 +2322,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2321 * interval if appropriate. 2322 * interval if appropriate.
2322 */ 2323 */
2323 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && 2324 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2324 (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && 2325 (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
2325 !tcp_may_send_now(sk)) 2326 !tcp_may_send_now(sk))
2326 return !tcp_pause_early_retransmit(sk, flag); 2327 return !tcp_pause_early_retransmit(sk, flag);
2327 2328
@@ -3081,6 +3082,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
3081 */ 3082 */
3082void tcp_rearm_rto(struct sock *sk) 3083void tcp_rearm_rto(struct sock *sk)
3083{ 3084{
3085 const struct inet_connection_sock *icsk = inet_csk(sk);
3084 struct tcp_sock *tp = tcp_sk(sk); 3086 struct tcp_sock *tp = tcp_sk(sk);
3085 3087
3086 /* If the retrans timer is currently being used by Fast Open 3088 /* If the retrans timer is currently being used by Fast Open
@@ -3094,12 +3096,13 @@ void tcp_rearm_rto(struct sock *sk)
3094 } else { 3096 } else {
3095 u32 rto = inet_csk(sk)->icsk_rto; 3097 u32 rto = inet_csk(sk)->icsk_rto;
3096 /* Offset the time elapsed after installing regular RTO */ 3098 /* Offset the time elapsed after installing regular RTO */
3097 if (tp->early_retrans_delayed) { 3099 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3100 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3098 struct sk_buff *skb = tcp_write_queue_head(sk); 3101 struct sk_buff *skb = tcp_write_queue_head(sk);
3099 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; 3102 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
3100 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); 3103 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
3101 /* delta may not be positive if the socket is locked 3104 /* delta may not be positive if the socket is locked
3102 * when the delayed ER timer fires and is rescheduled. 3105 * when the retrans timer fires and is rescheduled.
3103 */ 3106 */
3104 if (delta > 0) 3107 if (delta > 0)
3105 rto = delta; 3108 rto = delta;
@@ -3107,7 +3110,6 @@ void tcp_rearm_rto(struct sock *sk)
3107 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, 3110 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3108 TCP_RTO_MAX); 3111 TCP_RTO_MAX);
3109 } 3112 }
3110 tp->early_retrans_delayed = 0;
3111} 3113}
3112 3114
3113/* This function is called when the delayed ER timer fires. TCP enters 3115/* This function is called when the delayed ER timer fires. TCP enters
@@ -3601,7 +3603,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3601 if (after(ack, tp->snd_nxt)) 3603 if (after(ack, tp->snd_nxt))
3602 goto invalid_ack; 3604 goto invalid_ack;
3603 3605
3604 if (tp->early_retrans_delayed) 3606 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3607 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3605 tcp_rearm_rto(sk); 3608 tcp_rearm_rto(sk);
3606 3609
3607 if (after(ack, prior_snd_una)) 3610 if (after(ack, prior_snd_una))
@@ -3678,6 +3681,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3678 if (dst) 3681 if (dst)
3679 dst_confirm(dst); 3682 dst_confirm(dst);
3680 } 3683 }
3684
3685 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3686 tcp_schedule_loss_probe(sk);
3681 return 1; 3687 return 1;
3682 3688
3683no_queue: 3689no_queue: