author     David S. Miller <davem@davemloft.net>   2015-10-21 10:00:59 -0400
committer  David S. Miller <davem@davemloft.net>   2015-10-21 10:00:59 -0400
commit     eb9fae328faff9807a4ab5c1834b19f34dd155d4 (patch)
tree       86f37587abdfeee11ad36c75dcae37adf8aa091f
parent     c8fdc324916a864de753db6de6423b048c20cc0f (diff)
parent     4f41b1c58a32537542f14c1150099131613a5e8a (diff)
Merge branch 'tcp-rack'
Yuchung Cheng says:

====================
RACK loss detection

RACK (Recent ACK) loss recovery uses the notion of time instead of
packet sequence (FACK) or counts (dupthresh). It's inspired by the
FACK heuristic in tcp_mark_lost_retrans(): when a limited transmit
(new data packet) is sacked in recovery, then any retransmission sent
before that newly sacked packet was sent must have been lost, since
at least one round trip time has elapsed.

But that existing heuristic from tcp_mark_lost_retrans() has several
limitations:
  1) it can't detect tail drops since it depends on limited transmit
  2) it's disabled upon reordering (assumes no reordering)
  3) it's only enabled in fast recovery but not timeout recovery

RACK addresses these limitations with a core idea: an unacknowledged
packet P1 is deemed lost if a packet P2 that was sent later is
s/acked, since at least one round trip has passed.

Since RACK cares about the time sequence instead of the data sequence
of packets, it can detect tail drops when a later retransmission is
s/acked, while FACK or dupthresh can't. For reordering RACK uses a
dynamically adjusted reordering window ("reo_wnd") to reduce false
positives on every (small) degree of reordering, similar to the
delayed Early Retransmit.

In the current patch set RACK is only a supplemental loss detection
and does not trigger fast recovery. However we are developing RACK to
replace or consolidate FACK/dupthresh, early retransmit, and
thin-dupack. These heuristics all implicitly bear the time notion.
For example, the delayed Early Retransmit is simply applying RACK to
trigger the fast recovery with small inflight.

RACK requires measuring the minimum RTT. Tracking a global min is
less robust due to traffic engineering pathing changes. Therefore it
uses a windowed filter by Kathleen Nichols. The min RTT can also be
useful for various other purposes like congestion control or stat
monitoring.

This patch set has been used on Google servers for well over 1 year.
RACK has also been implemented in the QUIC protocol. We are
submitting an IETF draft as well.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
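To make the core rule above concrete, the following is a minimal
user-space sketch of the time-based loss decision. It is illustrative
only: the packet structure, microsecond clock, and reo_wnd value are
assumptions for the example and do not mirror the kernel data
structures changed in the diff below.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pkt {
	uint64_t sent_us;	/* (re)transmission time in microseconds */
	bool	 delivered;	/* cumulatively or selectively acked */
	bool	 lost;
};

/* A packet is deemed lost if some packet sent reo_wnd later has already
 * been delivered; reo_wnd (e.g. min_rtt/4) absorbs small reordering.
 */
static void rack_mark_lost(struct pkt *pkts, int n,
			   uint64_t latest_delivered_sent_us,
			   uint64_t reo_wnd)
{
	for (int i = 0; i < n; i++) {
		if (pkts[i].delivered || pkts[i].lost)
			continue;
		if (latest_delivered_sent_us > pkts[i].sent_us + reo_wnd)
			pkts[i].lost = true;
	}
}

int main(void)
{
	struct pkt pkts[3] = {
		{ .sent_us = 1000 },			/* P1: outstanding */
		{ .sent_us = 2500, .delivered = true },	/* P2: sent later, sacked */
		{ .sent_us = 2000 },			/* within reo_wnd, spared */
	};
	uint64_t min_rtt_us = 4000;

	/* P1 is marked lost because P2, sent 1500us later, was delivered */
	rack_mark_lost(pkts, 3, 2500, min_rtt_us / 4);
	for (int i = 0; i < 3; i++)
		printf("pkt %d lost=%d\n", i, pkts[i].lost);
	return 0;
}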
-rw-r--r--  Documentation/networking/ip-sysctl.txt  |  17
-rw-r--r--  include/linux/skbuff.h                   |   9
-rw-r--r--  include/linux/tcp.h                      |  11
-rw-r--r--  include/net/tcp.h                        |  21
-rw-r--r--  net/ipv4/Makefile                        |   1
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c               |  14
-rw-r--r--  net/ipv4/tcp.c                           |   1
-rw-r--r--  net/ipv4/tcp_input.c                     | 180
-rw-r--r--  net/ipv4/tcp_minisocks.c                 |   3
-rw-r--r--  net/ipv4/tcp_output.c                    |   6
-rw-r--r--  net/ipv4/tcp_recovery.c                  | 109
11 files changed, 286 insertions, 86 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ebe94f2cab98..85752c81c5ec 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -384,6 +384,14 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max
 	Defaults are calculated at boot time from amount of available
 	memory.
 
+tcp_min_rtt_wlen - INTEGER
+	The window length of the windowed min filter to track the minimum RTT.
+	A shorter window lets a flow more quickly pick up new (higher)
+	minimum RTT when it is moved to a longer path (e.g., due to traffic
+	engineering). A longer window makes the filter more resistant to RTT
+	inflations such as transient congestion. The unit is seconds.
+	Default: 300
+
 tcp_moderate_rcvbuf - BOOLEAN
 	If set, TCP performs receive buffer auto-tuning, attempting to
 	automatically size the buffer (no greater than tcp_rmem[2]) to
@@ -425,6 +433,15 @@ tcp_orphan_retries - INTEGER
 	you should think about lowering this value, such sockets
 	may consume significant resources. Cf. tcp_max_orphans.
 
+tcp_recovery - INTEGER
+	This value is a bitmap to enable various experimental loss recovery
+	features.
+
+	RACK: 0x1 enables the RACK loss detection for fast detection of lost
+	      retransmissions and tail drops.
+
+	Default: 0x1
+
 tcp_reordering - INTEGER
 	Initial reordering level of packets in a TCP stream.
 	TCP stack can then dynamically adjust flow reordering level
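As an optional illustration (not part of the patch): both knobs
documented above are exposed under the usual net.ipv4 sysctl hierarchy,
so a monitoring tool could read them from procfs. The snippet below is
a hedged sketch; the /proc/sys/net/ipv4 paths follow the standard
sysctl naming and error handling is kept minimal for brevity.

#include <stdio.h>

static long read_sysctl(const char *path)
{
	long val = -1;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("tcp_recovery     = %ld\n",
	       read_sysctl("/proc/sys/net/ipv4/tcp_recovery"));
	printf("tcp_min_rtt_wlen = %ld\n",
	       read_sysctl("/proc/sys/net/ipv4/tcp_min_rtt_wlen"));
	return 0;
}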
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4398411236f1..24f4dfd94c51 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -463,6 +463,15 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
 	return delta_us;
 }
 
+static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
+				    const struct skb_mstamp *t0)
+{
+	s32 diff = t1->stamp_jiffies - t0->stamp_jiffies;
+
+	if (!diff)
+		diff = t1->stamp_us - t0->stamp_us;
+	return diff > 0;
+}
 
 /**
  * struct sk_buff - socket buffer
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 86a7edaa6797..5dce9705fe84 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -194,6 +194,12 @@ struct tcp_sock {
 	u32	window_clamp;	/* Maximal window to advertise		*/
 	u32	rcv_ssthresh;	/* Current window clamp			*/
 
+	/* Information of the most recently (s)acked skb */
+	struct tcp_rack {
+		struct skb_mstamp mstamp; /* (Re)sent time of the skb */
+		u8 advanced; /* mstamp advanced since last lost marking */
+		u8 reord;    /* reordering detected */
+	} rack;
 	u16	advmss;		/* Advertised MSS			*/
 	u8	unused;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
@@ -217,6 +223,9 @@ struct tcp_sock {
 	u32	mdev_max_us;	/* maximal mdev for the last rtt period	*/
 	u32	rttvar_us;	/* smoothed mdev_max			*/
 	u32	rtt_seq;	/* sequence number to update rttvar	*/
+	struct rtt_meas {
+		u32 rtt, ts;	/* RTT in usec and sampling time in jiffies. */
+	} rtt_min[3];
 
 	u32	packets_out;	/* Packets which are "in flight"	*/
 	u32	retrans_out;	/* Retransmitted packets out		*/
@@ -280,8 +289,6 @@ struct tcp_sock {
 	int	lost_cnt_hint;
 	u32	retransmit_high;	/* L-bits may be on up to this seqno */
 
-	u32	lost_retrans_low;	/* Sent seq after any rxmit (lowest) */
-
 	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
 	u32	high_seq;	/* snd_nxt at onset of congestion	*/
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index eed94fc355c1..11e320412216 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -279,6 +279,7 @@ extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
+extern int sysctl_tcp_min_rtt_wlen;
 extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
 extern int sysctl_tcp_pacing_ss_ratio;
@@ -566,6 +567,7 @@ void tcp_resume_early_retransmit(struct sock *sk);
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
 
 /* tcp_timer.c */
 void tcp_init_xmit_timers(struct sock *);
@@ -671,6 +673,12 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
 	return dst_metric_locked(dst, RTAX_CC_ALGO);
 }
 
+/* Minimum RTT in usec. ~0 means not available. */
+static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
+{
+	return tp->rtt_min[0].rtt;
+}
+
 /* Compute the actual receive window we are currently advertising.
  * Rcv_nxt can be after the window if our peer push more data
  * than the offered window.
@@ -1743,6 +1751,19 @@ int tcpv4_offload_init(void);
 void tcp_v4_init(void);
 void tcp_init(void);
 
+/* tcp_recovery.c */
+
+/* Flags to enable various loss recovery features. See below */
+extern int sysctl_tcp_recovery;
+
+/* Use TCP RACK to detect (some) tail and retransmit losses */
+#define TCP_RACK_LOST_RETRANS  0x1
+
+extern int tcp_rack_mark_lost(struct sock *sk);
+
+extern void tcp_rack_advance(struct tcp_sock *tp,
+			     const struct skb_mstamp *xmit_time, u8 sacked);
+
 /*
  * Save and compile IPv4 options, return a pointer to it
  */
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 89aacb630a53..c29809f765dc 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+	     tcp_recovery.o \
 	     tcp_offload.o datagram.o raw.o udp.o udplite.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 894da3a70aff..25300c5e283b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname	= "tcp_recovery",
+		.data		= &sysctl_tcp_recovery,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "tcp_reordering",
 		.data		= &sysctl_tcp_reordering,
 		.maxlen		= sizeof(int),
@@ -577,6 +584,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname	= "tcp_min_rtt_wlen",
+		.data		= &sysctl_tcp_min_rtt_wlen,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
 		.procname	= "tcp_low_latency",
 		.data		= &sysctl_tcp_low_latency,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ac1bdbb50352..0cfa7c0c1e80 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk)
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+	tp->rtt_min[0].rtt = ~0U;
 
 	/* So many TCP implementations out there (incorrectly) count the
 	 * initial SYN frame in their delayed-ACK and congestion control
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 944eaca69115..fdd88c3803a6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 int sysctl_tcp_frto __read_mostly = 2;
+int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
 
 int sysctl_tcp_thin_dupack __read_mostly;
 
@@ -880,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 
 	if (metric > 0)
 		tcp_disable_early_retrans(tp);
+	tp->rack.reord = 1;
 }
 
 /* This must be called before lost_out is incremented */
@@ -905,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 	}
 }
 
-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
-					    struct sk_buff *skb)
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	tcp_verify_retransmit_hint(tp, skb);
 
@@ -1047,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
 	return !before(start_seq, end_seq - tp->max_window);
 }
 
-/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "B". Later note: FACK people cheated me again 8), we have to account
- * for reordering! Ugly, but should help.
- *
- * Search retransmitted skbs from write_queue that were sent when snd_nxt was
- * less than what is now known to be received by the other end (derived from
- * highest SACK block). Also calculate the lowest snd_nxt among the remaining
- * retransmitted skbs to avoid some costly processing per ACKs.
- */
-static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
-{
-	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
-	int cnt = 0;
-	u32 new_low_seq = tp->snd_nxt;
-	u32 received_upto = tcp_highest_sack_seq(tp);
-
-	if (!tcp_is_fack(tp) || !tp->retrans_out ||
-	    !after(received_upto, tp->lost_retrans_low) ||
-	    icsk->icsk_ca_state != TCP_CA_Recovery)
-		return;
-
-	tcp_for_write_queue(skb, sk) {
-		u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
-
-		if (skb == tcp_send_head(sk))
-			break;
-		if (cnt == tp->retrans_out)
-			break;
-		if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
-			continue;
-
-		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
-			continue;
-
-		/* TODO: We would like to get rid of tcp_is_fack(tp) only
-		 * constraint here (see above) but figuring out that at
-		 * least tp->reordering SACK blocks reside between ack_seq
-		 * and received_upto is not easy task to do cheaply with
-		 * the available datastructures.
-		 *
-		 * Whether FACK should check here for tp->reordering segs
-		 * in-between one could argue for either way (it would be
-		 * rather simple to implement as we could count fack_count
-		 * during the walk and do tp->fackets_out - fack_count).
-		 */
-		if (after(received_upto, ack_seq)) {
-			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
-			tp->retrans_out -= tcp_skb_pcount(skb);
-			*flag |= FLAG_LOST_RETRANS;
-			tcp_skb_mark_lost_uncond_verify(tp, skb);
-			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
-		} else {
-			if (before(ack_seq, new_low_seq))
-				new_low_seq = ack_seq;
-			cnt += tcp_skb_pcount(skb);
-		}
-	}
-
-	if (tp->retrans_out)
-		tp->lost_retrans_low = new_low_seq;
-}
-
 static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
 			    struct tcp_sack_block_wire *sp, int num_sacks,
 			    u32 prior_snd_una)
@@ -1236,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		return sacked;
 
 	if (!(sacked & TCPCB_SACKED_ACKED)) {
+		tcp_rack_advance(tp, xmit_time, sacked);
+
 		if (sacked & TCPCB_SACKED_RETRANS) {
 			/* If the segment is not tagged as lost,
 			 * we do not clear RETRANS, believing
@@ -1837,7 +1776,6 @@ advance_sp:
 	    ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
 		tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
 
-	tcp_mark_lost_retrans(sk, &state->flag);
 	tcp_verify_left_out(tp);
 out:
 
@@ -2314,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
+{
+	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+	       before(tp->rx_opt.rcv_tsecr, when);
+}
+
+/* skb is spurious retransmitted if the returned timestamp echo
+ * reply is prior to the skb transmission time
+ */
+static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
+				     const struct sk_buff *skb)
+{
+	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
+	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+}
+
 /* Nothing was retransmitted or returned timestamp is less
  * than timestamp of the first retransmission.
  */
 static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
 {
 	return !tp->retrans_stamp ||
-	       (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
-		before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
+	       tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
 }
 
 /* Undo procedures. */
@@ -2853,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		}
 	}
 
+	/* Use RACK to detect loss */
+	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
+	    tcp_rack_mark_lost(sk))
+		flag |= FLAG_LOST_RETRANS;
+
 	/* E. Process state. */
 	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
@@ -2915,8 +2873,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 	tcp_xmit_retransmit_queue(sk);
 }
 
+/* Kathleen Nichols' algorithm for tracking the minimum value of
+ * a data stream over some fixed time interval. (E.g., the minimum
+ * RTT over the past five minutes.) It uses constant space and constant
+ * time per update yet almost always delivers the same minimum as an
+ * implementation that has to keep all the data in the window.
+ *
+ * The algorithm keeps track of the best, 2nd best & 3rd best min
+ * values, maintaining an invariant that the measurement time of the
+ * n'th best >= n-1'th best. It also makes sure that the three values
+ * are widely separated in the time window since that bounds the worst
+ * case error when that data is monotonically increasing over the window.
+ *
+ * Upon getting a new min, we can forget everything earlier because it
+ * has no value - the new min is <= everything else in the window by
+ * definition and it's the most recent. So we restart fresh on every new min
+ * and overwrite 2nd & 3rd choices. The same property holds for 2nd & 3rd
+ * best.
+ */
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+{
+	const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
+	struct rtt_meas *m = tcp_sk(sk)->rtt_min;
+	struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
+	u32 elapsed;
+
+	/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
+	if (unlikely(rttm.rtt <= m[0].rtt))
+		m[0] = m[1] = m[2] = rttm;
+	else if (rttm.rtt <= m[1].rtt)
+		m[1] = m[2] = rttm;
+	else if (rttm.rtt <= m[2].rtt)
+		m[2] = rttm;
+
+	elapsed = now - m[0].ts;
+	if (unlikely(elapsed > wlen)) {
+		/* Passed entire window without a new min so make 2nd choice
+		 * the new min & 3rd choice the new 2nd. So forth and so on.
+		 */
+		m[0] = m[1];
+		m[1] = m[2];
+		m[2] = rttm;
+		if (now - m[0].ts > wlen) {
+			m[0] = m[1];
+			m[1] = rttm;
+			if (now - m[0].ts > wlen)
+				m[0] = rttm;
+		}
+	} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
+		/* Passed a quarter of the window without a new min so
+		 * take 2nd choice from the 2nd quarter of the window.
+		 */
+		m[2] = m[1] = rttm;
+	} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
+		/* Passed half the window without a new min so take the 3rd
+		 * choice from the last half of the window.
+		 */
+		m[2] = rttm;
+	}
+}
+
 static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
-				      long seq_rtt_us, long sack_rtt_us)
+				      long seq_rtt_us, long sack_rtt_us,
+				      long ca_rtt_us)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2925,9 +2944,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 * Karn's algorithm forbids taking RTT if some retransmitted data
 	 * is acked (RFC6298).
 	 */
-	if (flag & FLAG_RETRANS_DATA_ACKED)
-		seq_rtt_us = -1L;
-
 	if (seq_rtt_us < 0)
 		seq_rtt_us = sack_rtt_us;
 
@@ -2939,11 +2955,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 */
 	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
 	    flag & FLAG_ACKED)
-		seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-
+		seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
+							  tp->rx_opt.rcv_tsecr);
 	if (seq_rtt_us < 0)
 		return false;
 
+	/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
+	 * always taken together with ACK, SACK, or TS-opts. Any negative
+	 * values will be skipped with the seq_rtt_us < 0 check above.
+	 */
+	tcp_update_rtt_min(sk, ca_rtt_us);
 	tcp_rtt_estimator(sk, seq_rtt_us);
 	tcp_set_rto(sk);
 
@@ -2964,7 +2985,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
 		rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
 	}
 
-	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L);
+	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
 }
 
 
@@ -3131,6 +3152,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 		if (sacked & TCPCB_SACKED_ACKED)
 			tp->sacked_out -= acked_pcount;
+		else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
+			tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
 		if (sacked & TCPCB_LOST)
 			tp->lost_out -= acked_pcount;
 
@@ -3169,7 +3192,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		flag |= FLAG_SACK_RENEGING;
 
 	skb_mstamp_get(&now);
-	if (likely(first_ackt.v64)) {
+	if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
 		seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
 		ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
 	}
@@ -3178,7 +3201,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
 	}
 
-	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
+	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
+					ca_rtt_us);
 
 	if (flag & FLAG_ACKED) {
 		tcp_rearm_rto(sk);
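The windowed min filter added in tcp_update_rtt_min() above can be
exercised outside the kernel. The harness below is a hedged,
self-contained sketch of the same three-choice update logic: the
jiffies clock is replaced by a plain "now" argument, and the window
length, sample values, and time units are assumptions for the demo
rather than kernel behavior.

#include <stdint.h>
#include <stdio.h>

struct rtt_meas { uint32_t rtt, ts; };

static void rtt_min_update(struct rtt_meas m[3], uint32_t rtt,
			   uint32_t now, uint32_t wlen)
{
	struct rtt_meas rttm = { rtt ? rtt : 1, now };
	uint32_t elapsed;

	/* A new global min forgets everything; otherwise refresh 2nd/3rd */
	if (rttm.rtt <= m[0].rtt)
		m[0] = m[1] = m[2] = rttm;
	else if (rttm.rtt <= m[1].rtt)
		m[1] = m[2] = rttm;
	else if (rttm.rtt <= m[2].rtt)
		m[2] = rttm;

	elapsed = now - m[0].ts;
	if (elapsed > wlen) {
		/* window expired: promote the 2nd and 3rd choices */
		m[0] = m[1];
		m[1] = m[2];
		m[2] = rttm;
		if (now - m[0].ts > wlen) {
			m[0] = m[1];
			m[1] = rttm;
			if (now - m[0].ts > wlen)
				m[0] = rttm;
		}
	} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
		m[2] = m[1] = rttm;
	} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
		m[2] = rttm;
	}
}

int main(void)
{
	struct rtt_meas m[3] = { { ~0U, 0 }, { ~0U, 0 }, { ~0U, 0 } };
	uint32_t samples[] = { 50, 40, 45, 60, 70, 65 };

	/* Feed one sample every 100 time units with a 300-unit window;
	 * the tracked min rises from 40 to 45 once the old min expires.
	 */
	for (uint32_t t = 0; t < 6; t++) {
		rtt_min_update(m, samples[t], t * 100, 300);
		printf("t=%u sample=%u min=%u\n", t * 100, samples[t], m[0].rtt);
	}
	return 0;
}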
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 41828bdc5d32..1fd5d413a664 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -470,6 +470,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 
 	newtp->srtt_us = 0;
 	newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+	newtp->rtt_min[0].rtt = ~0U;
 	newicsk->icsk_rto = TCP_TIMEOUT_INIT;
 
 	newtp->packets_out = 0;
@@ -547,6 +548,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		tcp_ecn_openreq_child(newtp, req);
 		newtp->fastopen_rsk = NULL;
 		newtp->syn_data_acked = 0;
+		newtp->rack.mstamp.v64 = 0;
+		newtp->rack.advanced = 0;
 
 		newtp->saved_syn = req->saved_syn;
 		req->saved_syn = NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 19adedb8c5cc..f6f7f9b4901b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2655,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 			net_dbg_ratelimited("retrans_out leaked\n");
 		}
 #endif
-		if (!tp->retrans_out)
-			tp->lost_retrans_low = tp->snd_nxt;
 		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
 		tp->retrans_out += tcp_skb_pcount(skb);
 
@@ -2664,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (!tp->retrans_stamp)
 			tp->retrans_stamp = tcp_skb_timestamp(skb);
 
-		/* snd_nxt is stored to detect loss of retransmitted segment,
-		 * see tcp_input.c tcp_sacktag_write_queue().
-		 */
-		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
 	} else if (err != -EBUSY) {
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
new file mode 100644
index 000000000000..5353085fd0b2
--- /dev/null
+++ b/net/ipv4/tcp_recovery.c
@@ -0,0 +1,109 @@
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+
+/* Marks a packet lost, if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ *   dupthresh: 3 OOO packets delivered (packet count)
+ *   FACK: sequence delta to highest sacked sequence (sequence space)
+ *   RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is it applies to both original and retransmitted
+ * packet and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * The current version is only used after recovery starts but can be
+ * easily extended to detect the first loss.
+ */
+int tcp_rack_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	u32 reo_wnd, prior_retrans = tp->retrans_out;
+
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+		return 0;
+
+	/* Reset the advanced flag to avoid unnecessary queue scanning */
+	tp->rack.advanced = 0;
+
+	/* To be more reordering resilient, allow min_rtt/4 settling delay
+	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
+	 * RTT because reordering is often a path property and less related
+	 * to queuing or delayed ACKs.
+	 *
+	 * TODO: measure and adapt to the observed reordering delay, and
+	 * use a timer to retransmit like the delayed early retransmit.
+	 */
+	reo_wnd = 1000;
+	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+
+	tcp_for_write_queue(skb, sk) {
+		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+		if (skb == tcp_send_head(sk))
+			break;
+
+		/* Skip ones already (s)acked */
+		if (!after(scb->end_seq, tp->snd_una) ||
+		    scb->sacked & TCPCB_SACKED_ACKED)
+			continue;
+
+		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+
+			if (skb_mstamp_us_delta(&tp->rack.mstamp,
+						&skb->skb_mstamp) <= reo_wnd)
+				continue;
+
+			/* skb is lost if packet sent later is sacked */
+			tcp_skb_mark_lost_uncond_verify(tp, skb);
+			if (scb->sacked & TCPCB_SACKED_RETRANS) {
+				scb->sacked &= ~TCPCB_SACKED_RETRANS;
+				tp->retrans_out -= tcp_skb_pcount(skb);
+				NET_INC_STATS_BH(sock_net(sk),
+						 LINUX_MIB_TCPLOSTRETRANSMIT);
+			}
+		} else if (!(scb->sacked & TCPCB_RETRANS)) {
+			/* Original data are sent sequentially so stop early
+			 * b/c the rest are all sent after rack_sent
+			 */
+			break;
+		}
+	}
+	return prior_retrans - tp->retrans_out;
+}
+
+/* Record the most recently (re)sent time among the (s)acked packets */
+void tcp_rack_advance(struct tcp_sock *tp,
+		      const struct skb_mstamp *xmit_time, u8 sacked)
+{
+	if (tp->rack.mstamp.v64 &&
+	    !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+		return;
+
+	if (sacked & TCPCB_RETRANS) {
+		struct skb_mstamp now;
+
+		/* If the sacked packet was retransmitted, it's ambiguous
+		 * whether the retransmission or the original (or the prior
+		 * retransmission) was sacked.
+		 *
+		 * If the original is lost, there is no ambiguity. Otherwise
+		 * we assume the original can be delayed up to aRTT + min_rtt.
+		 * the aRTT term is bounded by the fast recovery or timeout,
+		 * so it's at least one RTT (i.e., retransmission is at least
+		 * an RTT later).
+		 */
+		skb_mstamp_get(&now);
+		if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+			return;
+	}
+
+	tp->rack.mstamp = *xmit_time;
+	tp->rack.advanced = 1;
+}