author      Nandita Dukkipati <nanditad@google.com>    2013-03-11 06:00:43 -0400
committer   David S. Miller <davem@davemloft.net>      2013-03-12 08:30:34 -0400
commit      6ba8a3b19e764b6a65e4030ab0999be50c291e6c (patch)
tree        57ba4b6411762d1124a3e08577e32e86769c024f
parent      83e519b63480e691d43ee106547b10941bfa0232 (diff)
tcp: Tail loss probe (TLP)
This patch series implements the Tail loss probe (TLP) algorithm described
in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The
first patch implements the basic algorithm.

TLP's goal is to reduce tail latency of short transactions. It achieves
this by converting retransmission timeouts (RTOs) occurring due to tail
losses (losses at the end of transactions) into fast recovery. TLP
transmits one packet in two round-trips when a connection is in Open
state and isn't receiving any ACKs. The transmitted packet, aka loss
probe, can be either new or a retransmission. When there is tail loss,
the ACK from a loss probe triggers FACK/early-retransmit based fast
recovery, thus avoiding a costly RTO. In the absence of loss, there is
no change in the connection state.

PTO stands for probe timeout. It is a timer event indicating that an ACK
is overdue and triggers a loss probe packet. The PTO value is set to
max(2*SRTT, 10ms) and is adjusted to account for the delayed ACK timer
when there is only one outstanding packet.

TLP Algorithm

On transmission of new data in Open state:
  -> packets_out > 1: schedule PTO in max(2*SRTT, 10ms).
  -> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms)
  -> PTO = min(PTO, RTO)

Conditions for scheduling PTO:
  -> Connection is in Open state.
  -> Connection is either cwnd limited or no new data to send.
  -> Number of probes per tail loss episode is limited to one.
  -> Connection is SACK enabled.

When PTO fires:
  new_segment_exists:
    -> transmit new segment.
    -> packets_out++. cwnd remains same.
  no_new_packet:
    -> retransmit the last segment. Its ACK triggers FACK or
       early retransmit based recovery.

ACK path:
  -> rearm RTO at start of ACK processing.
  -> reschedule PTO if need be.

In addition, the patch includes a small variation to the Early Retransmit
(ER) algorithm, such that ER and TLP together can in principle recover any
N-degree of tail loss through fast recovery. TLP is controlled by the same
sysctl as ER, the tcp_early_retrans sysctl:
  tcp_early_retrans == 0: disables TLP and ER.
                    == 1: enables RFC5827 ER.
                    == 2: delayed ER.
                    == 3: TLP and delayed ER. [DEFAULT]
                    == 4: TLP only.

The TLP patch series has been extensively tested on Google Web servers.
It is most effective for short Web transactions, where it reduced RTOs by
15% and improved HTTP response time (average by 6%, 99th percentile by
10%). The transmitted probes account for <0.5% of the overall
transmissions.

Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
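To make the PTO rule above concrete, here is a minimal C sketch. It is a
hypothetical userspace model (tlp_pto_ms and the helpers are illustrative
names, not kernel identifiers), working in milliseconds rather than jiffies:

#include <stdint.h>

static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }
static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }

/* Hypothetical model of the PTO rule above; all times in milliseconds,
 * not jiffies, and the function name is illustrative only.
 */
static uint32_t tlp_pto_ms(uint32_t srtt_ms, uint32_t packets_out,
                           uint32_t rto_ms)
{
        uint32_t pto = 2 * srtt_ms;

        /* With one outstanding packet, leave room for the receiver's
         * delayed-ACK timer (~200ms) before probing.
         */
        if (packets_out == 1)
                pto = max_u32(pto, srtt_ms + srtt_ms / 2 + 200);
        pto = max_u32(pto, 10);         /* 10ms floor */
        return min_u32(pto, rto_ms);    /* PTO = min(PTO, RTO) */
}

For example, with SRTT = 40ms and several segments outstanding this yields
an 80ms probe, while a lone segment waits max(80, 260) = 260ms so that a
delayed ACK cannot masquerade as a tail loss.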
-rw-r--r--  Documentation/networking/ip-sysctl.txt  |   8
-rw-r--r--  include/linux/tcp.h                     |   1
-rw-r--r--  include/net/inet_connection_sock.h      |   5
-rw-r--r--  include/net/tcp.h                       |   6
-rw-r--r--  include/uapi/linux/snmp.h               |   1
-rw-r--r--  net/ipv4/inet_diag.c                    |   4
-rw-r--r--  net/ipv4/proc.c                         |   1
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c              |   4
-rw-r--r--  net/ipv4/tcp_input.c                    |  24
-rw-r--r--  net/ipv4/tcp_ipv4.c                     |   4
-rw-r--r--  net/ipv4/tcp_output.c                   | 128
-rw-r--r--  net/ipv4/tcp_timer.c                    |  13
12 files changed, 171 insertions, 28 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index dc2dc87d2557..1cae6c383e1b 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -190,7 +190,9 @@ tcp_early_retrans - INTEGER
        Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold
        for triggering fast retransmit when the amount of outstanding data is
        small and when no previously unsent data can be transmitted (such
-       that limited transmit could be used).
+       that limited transmit could be used). Also controls the use of
+       Tail loss probe (TLP) that converts RTOs occurring due to tail
+       losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01).
        Possible values:
                0 disables ER
                1 enables ER
@@ -198,7 +200,9 @@ tcp_early_retrans - INTEGER
          by a fourth of RTT. This mitigates connection falsely
          recovers when network has a small degree of reordering
          (less than 3 packets).
-       Default: 2
+               3 enables delayed ER and TLP.
+               4 enables TLP only.
+       Default: 3
 
 tcp_ecn - INTEGER
        Control use of Explicit Congestion Notification (ECN) by TCP.
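For completeness, a small illustrative C program (hypothetical, not part of
the patch; any sysctl tool works equally well) that reads the mode documented
above and prints the meaning the hunk assigns to it:

#include <stdio.h>

int main(void)
{
        static const char *desc[] = {
                "ER and TLP disabled", "RFC5827 ER", "delayed ER",
                "delayed ER and TLP", "TLP only",
        };
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_early_retrans", "r");
        int mode;

        if (!f)
                return 1;
        if (fscanf(f, "%d", &mode) != 1 || mode < 0 || mode > 4) {
                fclose(f);
                return 1;
        }
        printf("tcp_early_retrans = %d (%s)\n", mode, desc[mode]);
        fclose(f);
        return 0;
}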
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 515c3746b675..01860d74555c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -201,7 +201,6 @@ struct tcp_sock {
                unused : 1;
        u8      repair_queue;
        u8      do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
-               early_retrans_delayed:1, /* Delayed ER timer installed */
                syn_data:1,     /* SYN includes data */
                syn_fastopen:1, /* SYN includes Fast Open option */
                syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 183292722f6e..de2c78529afa 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -133,6 +133,8 @@ struct inet_connection_sock {
 #define ICSK_TIME_RETRANS       1       /* Retransmit timer */
 #define ICSK_TIME_DACK          2       /* Delayed ack timer */
 #define ICSK_TIME_PROBE0        3       /* Zero window probe timer */
+#define ICSK_TIME_EARLY_RETRANS 4       /* Early retransmit timer */
+#define ICSK_TIME_LOSS_PROBE    5       /* Tail loss probe timer */
 
 static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
 {
@@ -222,7 +224,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
                when = max_when;
        }
 
-       if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) {
+       if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
+           what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) {
                icsk->icsk_pending = what;
                icsk->icsk_timeout = jiffies + when;
                sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
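With this hunk, the early-retransmit and loss-probe events share the single
retransmit timer, and icsk_pending records which logical event the armed
deadline belongs to. A simplified standalone model of that multiplexing,
using hypothetical names rather than kernel types:

/* Simplified model: several logical events share one timer, and
 * `pending` remembers which event the armed deadline is for.
 */
struct conn_timer {
        int             pending;        /* ICSK_TIME_*-style event id */
        unsigned long   timeout;        /* absolute deadline */
};

static void reset_xmit_timer(struct conn_timer *t, int what,
                             unsigned long now, unsigned long when)
{
        t->pending = what;
        t->timeout = now + when;
        /* the real code then re-arms the single
         * icsk_retransmit_timer via sk_reset_timer()
         */
}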
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a2baa5e4ba31..ab9f947b118b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -543,6 +543,8 @@ extern bool tcp_syn_flood_action(struct sock *sk,
 extern void tcp_push_one(struct sock *, unsigned int mss_now);
 extern void tcp_send_ack(struct sock *sk);
 extern void tcp_send_delayed_ack(struct sock *sk);
+extern void tcp_send_loss_probe(struct sock *sk);
+extern bool tcp_schedule_loss_probe(struct sock *sk);
 
 /* tcp_input.c */
 extern void tcp_cwnd_application_limited(struct sock *sk);
@@ -873,8 +875,8 @@ static inline void tcp_enable_fack(struct tcp_sock *tp)
 static inline void tcp_enable_early_retrans(struct tcp_sock *tp)
 {
        tp->do_early_retrans = sysctl_tcp_early_retrans &&
-               !sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3;
-       tp->early_retrans_delayed = 0;
+               sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack &&
+               sysctl_tcp_reordering == 3;
 }
 
 static inline void tcp_disable_early_retrans(struct tcp_sock *tp)
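Restated as a standalone predicate (illustrative only, not kernel code), the
condition above enables RFC5827-style ER for sysctl modes 1-3 but not for the
TLP-only mode 4:

/* Illustrative restatement of tcp_enable_early_retrans() above. */
static int er_enabled(int early_retrans, int thin_dupack, int reordering)
{
        return early_retrans && early_retrans < 4 &&
               !thin_dupack && reordering == 3;
}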
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index b49eab89c9fd..290bed6b085f 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -202,6 +202,7 @@ enum
        LINUX_MIB_TCPFORWARDRETRANS,            /* TCPForwardRetrans */
        LINUX_MIB_TCPSLOWSTARTRETRANS,          /* TCPSlowStartRetrans */
        LINUX_MIB_TCPTIMEOUTS,                  /* TCPTimeouts */
+       LINUX_MIB_TCPLOSSPROBES,                /* TCPLossProbes */
        LINUX_MIB_TCPRENORECOVERYFAIL,          /* TCPRenoRecoveryFail */
        LINUX_MIB_TCPSACKRECOVERYFAIL,          /* TCPSackRecoveryFail */
        LINUX_MIB_TCPSCHEDULERFAILED,           /* TCPSchedulerFailed */
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7afa2c3c788f..8620408af574 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -158,7 +158,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 
 #define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
 
-       if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+       if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+           icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+           icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                r->idiag_timer = 1;
                r->idiag_retrans = icsk->icsk_retransmits;
                r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 32030a24e776..4c35911d935f 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -224,6 +224,7 @@ static const struct snmp_mib snmp4_net_list[] = {
        SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
        SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
        SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+       SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
        SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
        SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
        SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 960fd29d9b8e..cca4550f4082 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,7 +28,7 @@
 
 static int zero;
 static int one = 1;
-static int two = 2;
+static int four = 4;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -760,7 +760,7 @@ static struct ctl_table ipv4_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &zero,
-               .extra2         = &two,
+               .extra2         = &four,
        },
        {
                .procname       = "udp_mem",
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0d9bdacce99f..b794f89ac1f2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -98,7 +98,7 @@ int sysctl_tcp_frto_response __read_mostly;
 int sysctl_tcp_thin_dupack __read_mostly;
 
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
-int sysctl_tcp_early_retrans __read_mostly = 2;
+int sysctl_tcp_early_retrans __read_mostly = 3;
 
 #define FLAG_DATA              0x01 /* Incoming frame contained data. */
 #define FLAG_WIN_UPDATE        0x02 /* Incoming ACK was a window update. */
@@ -2150,15 +2150,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
         * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
         * available, or RTO is scheduled to fire first.
         */
-       if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
+       if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
+           (flag & FLAG_ECE) || !tp->srtt)
                return false;
 
        delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
        if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
                return false;
 
-       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
-       tp->early_retrans_delayed = 1;
+       inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
+                                 TCP_RTO_MAX);
        return true;
 }
 
@@ -2321,7 +2322,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
         * interval if appropriate.
         */
        if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
-           (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
+           (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
            !tcp_may_send_now(sk))
                return !tcp_pause_early_retransmit(sk, flag);
 
@@ -3081,6 +3082,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
  */
 void tcp_rearm_rto(struct sock *sk)
 {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
 
        /* If the retrans timer is currently being used by Fast Open
@@ -3094,12 +3096,13 @@ void tcp_rearm_rto(struct sock *sk)
        } else {
                u32 rto = inet_csk(sk)->icsk_rto;
                /* Offset the time elapsed after installing regular RTO */
-               if (tp->early_retrans_delayed) {
+               if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+                   icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                        struct sk_buff *skb = tcp_write_queue_head(sk);
                        const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
                        s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
                        /* delta may not be positive if the socket is locked
-                        * when the delayed ER timer fires and is rescheduled.
+                        * when the retrans timer fires and is rescheduled.
                         */
                        if (delta > 0)
                                rto = delta;
@@ -3107,7 +3110,6 @@ void tcp_rearm_rto(struct sock *sk)
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
                                          TCP_RTO_MAX);
        }
-       tp->early_retrans_delayed = 0;
 }
 
 /* This function is called when the delayed ER timer fires. TCP enters
@@ -3601,7 +3603,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
        if (after(ack, tp->snd_nxt))
                goto invalid_ack;
 
-       if (tp->early_retrans_delayed)
+       if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+           icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);
 
        if (after(ack, prior_snd_una))
@@ -3678,6 +3681,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
                if (dst)
                        dst_confirm(dst);
        }
+
+       if (icsk->icsk_pending == ICSK_TIME_RETRANS)
+               tcp_schedule_loss_probe(sk);
        return 1;
 
 no_queue:
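The tcp_rearm_rto() change above re-arms the full RTO relative to the send
time of the oldest outstanding segment rather than to "now". A hedged sketch
of that arithmetic (remaining_rto is a hypothetical helper; u32 timestamps
wrap, hence the signed cast mirroring the (s32) in the hunk):

#include <stdint.h>

/* Hypothetical sketch of the offset logic in tcp_rearm_rto() above:
 * the RTO is measured from when the head segment was sent, so subtract
 * the time that has already elapsed.
 */
static uint32_t remaining_rto(uint32_t head_sent, uint32_t rto, uint32_t now)
{
        int32_t delta = (int32_t)(head_sent + rto - now);

        /* delta can be non-positive if the timer fired while the socket
         * was locked and is being rescheduled; use the full RTO then.
         */
        return delta > 0 ? (uint32_t)delta : rto;
}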
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8cdee120a50c..b7ab868c8284 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2703,7 +2703,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
        __u16 srcp = ntohs(inet->inet_sport);
        int rx_queue;
 
-       if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
+       if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+           icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+           icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
                timer_active    = 1;
                timer_expires   = icsk->icsk_timeout;
        } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e2b4461074da..beb63dbc85f5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -74,6 +74,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 {
+       struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int prior_packets = tp->packets_out;
 
@@ -85,7 +86,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
                tp->frto_counter = 3;
 
        tp->packets_out += tcp_skb_pcount(skb);
-       if (!prior_packets || tp->early_retrans_delayed)
+       if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+           icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);
 }
 
@@ -1959,6 +1961,9 @@ static int tcp_mtu_probe(struct sock *sk)
  * snd_up-64k-mss .. snd_up cannot be large. However, taking into
  * account rare use of URG, this is not a big flaw.
  *
+ * Send at most one packet when push_one > 0. Temporarily ignore
+ * cwnd limit to force at most one packet out when push_one == 2.
+
 * Returns true, if no segments are in flight and we have queued segments,
 * but cannot send anything now because of SWS or another problem.
 */
@@ -1994,8 +1999,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                        goto repair; /* Skip network transmission */
 
                cwnd_quota = tcp_cwnd_test(tp, skb);
-               if (!cwnd_quota)
-                       break;
+               if (!cwnd_quota) {
+                       if (push_one == 2)
+                               /* Force out a loss probe pkt. */
+                               cwnd_quota = 1;
+                       else
+                               break;
+               }
 
                if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
                        break;
@@ -2049,10 +2059,120 @@ repair:
        if (likely(sent_pkts)) {
                if (tcp_in_cwnd_reduction(sk))
                        tp->prr_out += sent_pkts;
+
+               /* Send one loss probe per tail loss episode. */
+               if (push_one != 2)
+                       tcp_schedule_loss_probe(sk);
                tcp_cwnd_validate(sk);
                return false;
        }
-       return !tp->packets_out && tcp_send_head(sk);
+       return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
+}
+
+bool tcp_schedule_loss_probe(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       u32 timeout, tlp_time_stamp, rto_time_stamp;
+       u32 rtt = tp->srtt >> 3;
+
+       if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
+               return false;
+       /* No consecutive loss probes. */
+       if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
+               tcp_rearm_rto(sk);
+               return false;
+       }
+       /* Don't do any loss probe on a Fast Open connection before 3WHS
+        * finishes.
+        */
+       if (sk->sk_state == TCP_SYN_RECV)
+               return false;
+
+       /* TLP is only scheduled when next timer event is RTO. */
+       if (icsk->icsk_pending != ICSK_TIME_RETRANS)
+               return false;
+
+       /* Schedule a loss probe in 2*RTT for SACK capable connections
+        * in Open state, that are either limited by cwnd or application.
+        */
+       if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
+           !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
+               return false;
+
+       if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
+            tcp_send_head(sk))
+               return false;
+
+       /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account
+        * for delayed ack when there's one outstanding packet.
+        */
+       timeout = rtt << 1;
+       if (tp->packets_out == 1)
+               timeout = max_t(u32, timeout,
+                               (rtt + (rtt >> 1) + TCP_DELACK_MAX));
+       timeout = max_t(u32, timeout, msecs_to_jiffies(10));
+
+       /* If RTO is shorter, just schedule TLP in its place. */
+       tlp_time_stamp = tcp_time_stamp + timeout;
+       rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
+       if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
+               s32 delta = rto_time_stamp - tcp_time_stamp;
+               if (delta > 0)
+                       timeout = delta;
+       }
+
+       inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+                                 TCP_RTO_MAX);
+       return true;
+}
+
+/* When probe timeout (PTO) fires, send a new segment if one exists, else
+ * retransmit the last segment.
+ */
+void tcp_send_loss_probe(struct sock *sk)
+{
+       struct sk_buff *skb;
+       int pcount;
+       int mss = tcp_current_mss(sk);
+       int err = -1;
+
+       if (tcp_send_head(sk) != NULL) {
+               err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+               goto rearm_timer;
+       }
+
+       /* Retransmit last segment. */
+       skb = tcp_write_queue_tail(sk);
+       if (WARN_ON(!skb))
+               goto rearm_timer;
+
+       pcount = tcp_skb_pcount(skb);
+       if (WARN_ON(!pcount))
+               goto rearm_timer;
+
+       if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
+               if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss)))
+                       goto rearm_timer;
+               skb = tcp_write_queue_tail(sk);
+       }
+
+       if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
+               goto rearm_timer;
+
+       /* Probe with zero data doesn't trigger fast recovery. */
+       if (skb->len > 0)
+               err = __tcp_retransmit_skb(sk, skb);
+
+rearm_timer:
+       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+                                 inet_csk(sk)->icsk_rto,
+                                 TCP_RTO_MAX);
+
+       if (likely(!err))
+               NET_INC_STATS_BH(sock_net(sk),
+                                LINUX_MIB_TCPLOSSPROBES);
+       return;
 }
 
 /* Push out any pending frames which were held back due to
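One detail of tcp_send_loss_probe() above worth spelling out: when the tail
skb covers several MSS-worth of data, it is fragmented so that the probe
retransmits exactly one MSS. A minimal sketch of that sizing rule
(probe_split_offset is a hypothetical helper, not kernel code):

/* Hypothetical sketch of the probe-sizing rule in tcp_send_loss_probe():
 * if the tail skb spans pcount > 1 segments and carries more data than
 * (pcount - 1) full segments, split at that byte offset so only the
 * final MSS-sized piece is retransmitted as the probe.
 */
static int probe_split_offset(int skb_len, int pcount, int mss)
{
        if (pcount > 1 && skb_len > (pcount - 1) * mss)
                return (pcount - 1) * mss;      /* fragment here */
        return -1;                              /* no split needed */
}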
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b78aac30c498..ecd61d54147f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -342,10 +342,6 @@ void tcp_retransmit_timer(struct sock *sk)
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
 
-       if (tp->early_retrans_delayed) {
-               tcp_resume_early_retransmit(sk);
-               return;
-       }
        if (tp->fastopen_rsk) {
                WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
                             sk->sk_state != TCP_FIN_WAIT1);
@@ -495,13 +491,20 @@ void tcp_write_timer_handler(struct sock *sk)
        }
 
        event = icsk->icsk_pending;
-       icsk->icsk_pending = 0;
 
        switch (event) {
+       case ICSK_TIME_EARLY_RETRANS:
+               tcp_resume_early_retransmit(sk);
+               break;
+       case ICSK_TIME_LOSS_PROBE:
+               tcp_send_loss_probe(sk);
+               break;
        case ICSK_TIME_RETRANS:
+               icsk->icsk_pending = 0;
                tcp_retransmit_timer(sk);
                break;
        case ICSK_TIME_PROBE0:
+               icsk->icsk_pending = 0;
                tcp_probe_timer(sk);
                break;
        }
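A final observation on the dispatch above: ICSK_TIME_RETRANS and
ICSK_TIME_PROBE0 now clear icsk_pending before their handlers run, while the
ER and TLP cases leave it set, since tcp_rearm_rto() keys off the pending
event and both handlers re-arm the timer themselves (which overwrites it). A
self-contained sketch of that shape, with hypothetical names standing in for
the kernel functions:

enum xmit_event { EV_NONE, EV_RETRANS, EV_PROBE0, EV_EARLY_RETRANS, EV_LOSS_PROBE };

/* Illustrative dispatch mirroring tcp_write_timer_handler() above;
 * the comments name the kernel handlers each case stands in for.
 */
static void write_timer_dispatch(enum xmit_event *pending)
{
        switch (*pending) {
        case EV_EARLY_RETRANS:          /* tcp_resume_early_retransmit() */
        case EV_LOSS_PROBE:             /* tcp_send_loss_probe() */
                /* left set: the handler re-arms the timer (which
                 * overwrites it) and the rearm path inspects it
                 */
                break;
        case EV_RETRANS:                /* tcp_retransmit_timer() */
        case EV_PROBE0:                 /* tcp_probe_timer() */
                *pending = EV_NONE;     /* cleared before the handler */
                break;
        default:
                break;
        }
}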