diff options
author | Nandita Dukkipati <nanditad@google.com> | 2013-03-11 06:00:43 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2013-03-12 08:30:34 -0400 |
commit | 6ba8a3b19e764b6a65e4030ab0999be50c291e6c (patch) | |
tree | 57ba4b6411762d1124a3e08577e32e86769c024f /include | |
parent | 83e519b63480e691d43ee106547b10941bfa0232 (diff) |
tcp: Tail loss probe (TLP)
This patch series implements the Tail loss probe (TLP) algorithm described
in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The
first patch implements the basic algorithm.
TLP's goal is to reduce tail latency of short transactions. It achieves
this by converting retransmission timeouts (RTOs) occurring due
to tail losses (losses at end of transactions) into fast recovery.
TLP transmits one packet in two round-trips when a connection is in
Open state and isn't receiving any ACKs. The transmitted packet, aka
loss probe, can be either new or a retransmission. When there is tail
loss, the ACK from a loss probe triggers FACK/early-retransmit based
fast recovery, thus avoiding a costly RTO. In the absence of loss,
there is no change in the connection state.
PTO stands for probe timeout. It is a timer event indicating
that an ACK is overdue and triggers a loss probe packet. The PTO value
is set to max(2*SRTT, 10ms) and is adjusted to account for delayed
ACK timer when there is only one outstanding packet.
TLP Algorithm
On transmission of new data in Open state:
-> packets_out > 1: schedule PTO in max(2*SRTT, 10ms).
-> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms)
-> PTO = min(PTO, RTO)
Conditions for scheduling PTO:
-> Connection is in Open state.
-> Connection is either cwnd limited or no new data to send.
-> Number of probes per tail loss episode is limited to one.
-> Connection is SACK enabled.
When PTO fires:
new_segment_exists:
-> transmit new segment.
-> packets_out++. cwnd remains same.
no_new_packet:
-> retransmit the last segment.
Its ACK triggers FACK or early retransmit based recovery.
ACK path:
-> rearm RTO at start of ACK processing.
-> reschedule PTO if need be.
In addition, the patch includes a small variation to the Early Retransmit
(ER) algorithm, such that ER and TLP together can in principle recover any
N-degree of tail loss through fast recovery. TLP is controlled by the same
sysctl as ER, tcp_early_retrans sysctl.
tcp_early_retrans==0; disables TLP and ER.
==1; enables RFC5827 ER.
==2; delayed ER.
==3; TLP and delayed ER. [DEFAULT]
==4; TLP only.
The TLP patch series has been extensively tested on Google Web servers.
It is most effective for short Web transactions, where it reduced RTOs by 15%
and improved HTTP response time (average by 6%, 99th percentile by 10%).
The transmitted probes account for <0.5% of the overall transmissions.
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/tcp.h | 1 | ||||
-rw-r--r-- | include/net/inet_connection_sock.h | 5 | ||||
-rw-r--r-- | include/net/tcp.h | 6 | ||||
-rw-r--r-- | include/uapi/linux/snmp.h | 1 |
4 files changed, 9 insertions, 4 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 515c3746b675..01860d74555c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
@@ -201,7 +201,6 @@ struct tcp_sock { | |||
201 | unused : 1; | 201 | unused : 1; |
202 | u8 repair_queue; | 202 | u8 repair_queue; |
203 | u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ | 203 | u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ |
204 | early_retrans_delayed:1, /* Delayed ER timer installed */ | ||
205 | syn_data:1, /* SYN includes data */ | 204 | syn_data:1, /* SYN includes data */ |
206 | syn_fastopen:1, /* SYN includes Fast Open option */ | 205 | syn_fastopen:1, /* SYN includes Fast Open option */ |
207 | syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ | 206 | syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ |
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 183292722f6e..de2c78529afa 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h | |||
@@ -133,6 +133,8 @@ struct inet_connection_sock { | |||
133 | #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ | 133 | #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ |
134 | #define ICSK_TIME_DACK 2 /* Delayed ack timer */ | 134 | #define ICSK_TIME_DACK 2 /* Delayed ack timer */ |
135 | #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ | 135 | #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ |
136 | #define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */ | ||
137 | #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ | ||
136 | 138 | ||
137 | static inline struct inet_connection_sock *inet_csk(const struct sock *sk) | 139 | static inline struct inet_connection_sock *inet_csk(const struct sock *sk) |
138 | { | 140 | { |
@@ -222,7 +224,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, | |||
222 | when = max_when; | 224 | when = max_when; |
223 | } | 225 | } |
224 | 226 | ||
225 | if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { | 227 | if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || |
228 | what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) { | ||
226 | icsk->icsk_pending = what; | 229 | icsk->icsk_pending = what; |
227 | icsk->icsk_timeout = jiffies + when; | 230 | icsk->icsk_timeout = jiffies + when; |
228 | sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); | 231 | sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); |
diff --git a/include/net/tcp.h b/include/net/tcp.h index a2baa5e4ba31..ab9f947b118b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -543,6 +543,8 @@ extern bool tcp_syn_flood_action(struct sock *sk, | |||
543 | extern void tcp_push_one(struct sock *, unsigned int mss_now); | 543 | extern void tcp_push_one(struct sock *, unsigned int mss_now); |
544 | extern void tcp_send_ack(struct sock *sk); | 544 | extern void tcp_send_ack(struct sock *sk); |
545 | extern void tcp_send_delayed_ack(struct sock *sk); | 545 | extern void tcp_send_delayed_ack(struct sock *sk); |
546 | extern void tcp_send_loss_probe(struct sock *sk); | ||
547 | extern bool tcp_schedule_loss_probe(struct sock *sk); | ||
546 | 548 | ||
547 | /* tcp_input.c */ | 549 | /* tcp_input.c */ |
548 | extern void tcp_cwnd_application_limited(struct sock *sk); | 550 | extern void tcp_cwnd_application_limited(struct sock *sk); |
@@ -873,8 +875,8 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) | |||
873 | static inline void tcp_enable_early_retrans(struct tcp_sock *tp) | 875 | static inline void tcp_enable_early_retrans(struct tcp_sock *tp) |
874 | { | 876 | { |
875 | tp->do_early_retrans = sysctl_tcp_early_retrans && | 877 | tp->do_early_retrans = sysctl_tcp_early_retrans && |
876 | !sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3; | 878 | sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack && |
877 | tp->early_retrans_delayed = 0; | 879 | sysctl_tcp_reordering == 3; |
878 | } | 880 | } |
879 | 881 | ||
880 | static inline void tcp_disable_early_retrans(struct tcp_sock *tp) | 882 | static inline void tcp_disable_early_retrans(struct tcp_sock *tp) |
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index b49eab89c9fd..290bed6b085f 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h | |||
@@ -202,6 +202,7 @@ enum | |||
202 | LINUX_MIB_TCPFORWARDRETRANS, /* TCPForwardRetrans */ | 202 | LINUX_MIB_TCPFORWARDRETRANS, /* TCPForwardRetrans */ |
203 | LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */ | 203 | LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */ |
204 | LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */ | 204 | LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */ |
205 | LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */ | ||
205 | LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */ | 206 | LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */ |
206 | LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */ | 207 | LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */ |
207 | LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */ | 208 | LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */ |