aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorNandita Dukkipati <nanditad@google.com>2013-03-11 06:00:43 -0400
committerDavid S. Miller <davem@davemloft.net>2013-03-12 08:30:34 -0400
commit6ba8a3b19e764b6a65e4030ab0999be50c291e6c (patch)
tree57ba4b6411762d1124a3e08577e32e86769c024f /include
parent83e519b63480e691d43ee106547b10941bfa0232 (diff)
tcp: Tail loss probe (TLP)
This patch series implement the Tail loss probe (TLP) algorithm described in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The first patch implements the basic algorithm. TLP's goal is to reduce tail latency of short transactions. It achieves this by converting retransmission timeouts (RTOs) occuring due to tail losses (losses at end of transactions) into fast recovery. TLP transmits one packet in two round-trips when a connection is in Open state and isn't receiving any ACKs. The transmitted packet, aka loss probe, can be either new or a retransmission. When there is tail loss, the ACK from a loss probe triggers FACK/early-retransmit based fast recovery, thus avoiding a costly RTO. In the absence of loss, there is no change in the connection state. PTO stands for probe timeout. It is a timer event indicating that an ACK is overdue and triggers a loss probe packet. The PTO value is set to max(2*SRTT, 10ms) and is adjusted to account for delayed ACK timer when there is only one oustanding packet. TLP Algorithm On transmission of new data in Open state: -> packets_out > 1: schedule PTO in max(2*SRTT, 10ms). -> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms) -> PTO = min(PTO, RTO) Conditions for scheduling PTO: -> Connection is in Open state. -> Connection is either cwnd limited or no new data to send. -> Number of probes per tail loss episode is limited to one. -> Connection is SACK enabled. When PTO fires: new_segment_exists: -> transmit new segment. -> packets_out++. cwnd remains same. no_new_packet: -> retransmit the last segment. Its ACK triggers FACK or early retransmit based recovery. ACK path: -> rearm RTO at start of ACK processing. -> reschedule PTO if need be. In addition, the patch includes a small variation to the Early Retransmit (ER) algorithm, such that ER and TLP together can in principle recover any N-degree of tail loss through fast recovery. TLP is controlled by the same sysctl as ER, tcp_early_retrans sysctl. tcp_early_retrans==0; disables TLP and ER. ==1; enables RFC5827 ER. ==2; delayed ER. ==3; TLP and delayed ER. [DEFAULT] ==4; TLP only. The TLP patch series have been extensively tested on Google Web servers. It is most effective for short Web trasactions, where it reduced RTOs by 15% and improved HTTP response time (average by 6%, 99th percentile by 10%). The transmitted probes account for <0.5% of the overall transmissions. Signed-off-by: Nandita Dukkipati <nanditad@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Acked-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/tcp.h1
-rw-r--r--include/net/inet_connection_sock.h5
-rw-r--r--include/net/tcp.h6
-rw-r--r--include/uapi/linux/snmp.h1
4 files changed, 9 insertions, 4 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 515c3746b675..01860d74555c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -201,7 +201,6 @@ struct tcp_sock {
201 unused : 1; 201 unused : 1;
202 u8 repair_queue; 202 u8 repair_queue;
203 u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ 203 u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */
204 early_retrans_delayed:1, /* Delayed ER timer installed */
205 syn_data:1, /* SYN includes data */ 204 syn_data:1, /* SYN includes data */
206 syn_fastopen:1, /* SYN includes Fast Open option */ 205 syn_fastopen:1, /* SYN includes Fast Open option */
207 syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ 206 syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 183292722f6e..de2c78529afa 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -133,6 +133,8 @@ struct inet_connection_sock {
133#define ICSK_TIME_RETRANS 1 /* Retransmit timer */ 133#define ICSK_TIME_RETRANS 1 /* Retransmit timer */
134#define ICSK_TIME_DACK 2 /* Delayed ack timer */ 134#define ICSK_TIME_DACK 2 /* Delayed ack timer */
135#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ 135#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */
136#define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */
137#define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */
136 138
137static inline struct inet_connection_sock *inet_csk(const struct sock *sk) 139static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
138{ 140{
@@ -222,7 +224,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
222 when = max_when; 224 when = max_when;
223 } 225 }
224 226
225 if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { 227 if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
228 what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) {
226 icsk->icsk_pending = what; 229 icsk->icsk_pending = what;
227 icsk->icsk_timeout = jiffies + when; 230 icsk->icsk_timeout = jiffies + when;
228 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); 231 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a2baa5e4ba31..ab9f947b118b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -543,6 +543,8 @@ extern bool tcp_syn_flood_action(struct sock *sk,
543extern void tcp_push_one(struct sock *, unsigned int mss_now); 543extern void tcp_push_one(struct sock *, unsigned int mss_now);
544extern void tcp_send_ack(struct sock *sk); 544extern void tcp_send_ack(struct sock *sk);
545extern void tcp_send_delayed_ack(struct sock *sk); 545extern void tcp_send_delayed_ack(struct sock *sk);
546extern void tcp_send_loss_probe(struct sock *sk);
547extern bool tcp_schedule_loss_probe(struct sock *sk);
546 548
547/* tcp_input.c */ 549/* tcp_input.c */
548extern void tcp_cwnd_application_limited(struct sock *sk); 550extern void tcp_cwnd_application_limited(struct sock *sk);
@@ -873,8 +875,8 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
873static inline void tcp_enable_early_retrans(struct tcp_sock *tp) 875static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
874{ 876{
875 tp->do_early_retrans = sysctl_tcp_early_retrans && 877 tp->do_early_retrans = sysctl_tcp_early_retrans &&
876 !sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3; 878 sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
877 tp->early_retrans_delayed = 0; 879 sysctl_tcp_reordering == 3;
878} 880}
879 881
880static inline void tcp_disable_early_retrans(struct tcp_sock *tp) 882static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index b49eab89c9fd..290bed6b085f 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -202,6 +202,7 @@ enum
202 LINUX_MIB_TCPFORWARDRETRANS, /* TCPForwardRetrans */ 202 LINUX_MIB_TCPFORWARDRETRANS, /* TCPForwardRetrans */
203 LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */ 203 LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */
204 LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */ 204 LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */
205 LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */
205 LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */ 206 LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */
206 LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */ 207 LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */
207 LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */ 208 LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */