diff options
author | Nandita Dukkipati <nanditad@google.com> | 2013-03-11 06:00:43 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2013-03-12 08:30:34 -0400 |
commit | 6ba8a3b19e764b6a65e4030ab0999be50c291e6c (patch) | |
tree | 57ba4b6411762d1124a3e08577e32e86769c024f /net/ipv4/tcp_input.c | |
parent | 83e519b63480e691d43ee106547b10941bfa0232 (diff) |
tcp: Tail loss probe (TLP)
This patch series implements the Tail loss probe (TLP) algorithm described
in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The
first patch implements the basic algorithm.
TLP's goal is to reduce tail latency of short transactions. It achieves
this by converting retransmission timeouts (RTOs) occurring due
to tail losses (losses at end of transactions) into fast recovery.
TLP transmits one packet in two round-trips when a connection is in
Open state and isn't receiving any ACKs. The transmitted packet, aka
loss probe, can be either new or a retransmission. When there is tail
loss, the ACK from a loss probe triggers FACK/early-retransmit based
fast recovery, thus avoiding a costly RTO. In the absence of loss,
there is no change in the connection state.
PTO stands for probe timeout. It is a timer event indicating
that an ACK is overdue and triggers a loss probe packet. The PTO value
is set to max(2*SRTT, 10ms) and is adjusted to account for the delayed
ACK timer when there is only one outstanding packet.
TLP Algorithm
On transmission of new data in Open state:
-> packets_out > 1: schedule PTO in max(2*SRTT, 10ms).
-> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms)
-> PTO = min(PTO, RTO)
Conditions for scheduling PTO:
-> Connection is in Open state.
-> Connection is either cwnd limited or no new data to send.
-> Number of probes per tail loss episode is limited to one.
-> Connection is SACK enabled.
When PTO fires:
new_segment_exists:
-> transmit new segment.
-> packets_out++. cwnd remains same.
no_new_packet:
-> retransmit the last segment.
Its ACK triggers FACK or early retransmit based recovery.
ACK path:
-> rearm RTO at start of ACK processing.
-> reschedule PTO if need be.
In addition, the patch includes a small variation to the Early Retransmit
(ER) algorithm, such that ER and TLP together can in principle recover any
N-degree of tail loss through fast recovery. TLP is controlled by the same
sysctl as ER, tcp_early_retrans sysctl.
tcp_early_retrans==0; disables TLP and ER.
==1; enables RFC5827 ER.
==2; delayed ER.
==3; TLP and delayed ER. [DEFAULT]
==4; TLP only.
The TLP patch series has been extensively tested on Google Web servers.
It is most effective for short Web transactions, where it reduced RTOs by 15%
and improved HTTP response time (average by 6%, 99th percentile by 10%).
The transmitted probes account for <0.5% of the overall transmissions.
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 24 |
1 files changed, 15 insertions, 9 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0d9bdacce99f..b794f89ac1f2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -98,7 +98,7 @@ int sysctl_tcp_frto_response __read_mostly; | |||
98 | int sysctl_tcp_thin_dupack __read_mostly; | 98 | int sysctl_tcp_thin_dupack __read_mostly; |
99 | 99 | ||
100 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; | 100 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; |
101 | int sysctl_tcp_early_retrans __read_mostly = 2; | 101 | int sysctl_tcp_early_retrans __read_mostly = 3; |
102 | 102 | ||
103 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 103 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
104 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 104 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
@@ -2150,15 +2150,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) | |||
2150 | * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples | 2150 | * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples |
2151 | * available, or RTO is scheduled to fire first. | 2151 | * available, or RTO is scheduled to fire first. |
2152 | */ | 2152 | */ |
2153 | if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) | 2153 | if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || |
2154 | (flag & FLAG_ECE) || !tp->srtt) | ||
2154 | return false; | 2155 | return false; |
2155 | 2156 | ||
2156 | delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); | 2157 | delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); |
2157 | if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) | 2158 | if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) |
2158 | return false; | 2159 | return false; |
2159 | 2160 | ||
2160 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); | 2161 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, |
2161 | tp->early_retrans_delayed = 1; | 2162 | TCP_RTO_MAX); |
2162 | return true; | 2163 | return true; |
2163 | } | 2164 | } |
2164 | 2165 | ||
@@ -2321,7 +2322,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) | |||
2321 | * interval if appropriate. | 2322 | * interval if appropriate. |
2322 | */ | 2323 | */ |
2323 | if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && | 2324 | if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && |
2324 | (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && | 2325 | (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && |
2325 | !tcp_may_send_now(sk)) | 2326 | !tcp_may_send_now(sk)) |
2326 | return !tcp_pause_early_retransmit(sk, flag); | 2327 | return !tcp_pause_early_retransmit(sk, flag); |
2327 | 2328 | ||
@@ -3081,6 +3082,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) | |||
3081 | */ | 3082 | */ |
3082 | void tcp_rearm_rto(struct sock *sk) | 3083 | void tcp_rearm_rto(struct sock *sk) |
3083 | { | 3084 | { |
3085 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
3084 | struct tcp_sock *tp = tcp_sk(sk); | 3086 | struct tcp_sock *tp = tcp_sk(sk); |
3085 | 3087 | ||
3086 | /* If the retrans timer is currently being used by Fast Open | 3088 | /* If the retrans timer is currently being used by Fast Open |
@@ -3094,12 +3096,13 @@ void tcp_rearm_rto(struct sock *sk) | |||
3094 | } else { | 3096 | } else { |
3095 | u32 rto = inet_csk(sk)->icsk_rto; | 3097 | u32 rto = inet_csk(sk)->icsk_rto; |
3096 | /* Offset the time elapsed after installing regular RTO */ | 3098 | /* Offset the time elapsed after installing regular RTO */ |
3097 | if (tp->early_retrans_delayed) { | 3099 | if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || |
3100 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { | ||
3098 | struct sk_buff *skb = tcp_write_queue_head(sk); | 3101 | struct sk_buff *skb = tcp_write_queue_head(sk); |
3099 | const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; | 3102 | const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; |
3100 | s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); | 3103 | s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); |
3101 | /* delta may not be positive if the socket is locked | 3104 | /* delta may not be positive if the socket is locked |
3102 | * when the delayed ER timer fires and is rescheduled. | 3105 | * when the retrans timer fires and is rescheduled. |
3103 | */ | 3106 | */ |
3104 | if (delta > 0) | 3107 | if (delta > 0) |
3105 | rto = delta; | 3108 | rto = delta; |
@@ -3107,7 +3110,6 @@ void tcp_rearm_rto(struct sock *sk) | |||
3107 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, | 3110 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, |
3108 | TCP_RTO_MAX); | 3111 | TCP_RTO_MAX); |
3109 | } | 3112 | } |
3110 | tp->early_retrans_delayed = 0; | ||
3111 | } | 3113 | } |
3112 | 3114 | ||
3113 | /* This function is called when the delayed ER timer fires. TCP enters | 3115 | /* This function is called when the delayed ER timer fires. TCP enters |
@@ -3601,7 +3603,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3601 | if (after(ack, tp->snd_nxt)) | 3603 | if (after(ack, tp->snd_nxt)) |
3602 | goto invalid_ack; | 3604 | goto invalid_ack; |
3603 | 3605 | ||
3604 | if (tp->early_retrans_delayed) | 3606 | if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || |
3607 | icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) | ||
3605 | tcp_rearm_rto(sk); | 3608 | tcp_rearm_rto(sk); |
3606 | 3609 | ||
3607 | if (after(ack, prior_snd_una)) | 3610 | if (after(ack, prior_snd_una)) |
@@ -3678,6 +3681,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3678 | if (dst) | 3681 | if (dst) |
3679 | dst_confirm(dst); | 3682 | dst_confirm(dst); |
3680 | } | 3683 | } |
3684 | |||
3685 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) | ||
3686 | tcp_schedule_loss_probe(sk); | ||
3681 | return 1; | 3687 | return 1; |
3682 | 3688 | ||
3683 | no_queue: | 3689 | no_queue: |