author		Yuchung Cheng <ycheng@google.com>	2015-10-17 00:57:47 -0400
committer	David S. Miller <davem@davemloft.net>	2015-10-21 10:00:53 -0400
commit		4f41b1c58a32537542f14c1150099131613a5e8a (patch)
tree		86f37587abdfeee11ad36c75dcae37adf8aa091f
parent		659a8ad56f490279f0efee43a62ffa1ac914a4e0 (diff)
tcp: use RACK to detect losses
This patch implements the second half of RACK, which uses the most recent transmit time among all delivered packets to detect losses.

tcp_rack_mark_lost() is called upon receiving a dubious ACK. It checks whether a not-yet-sacked packet was sent at least "reo_wnd" before the sent time of the most recently delivered packet. If so, the packet is deemed lost.

The "reo_wnd" reordering window starts at 1 msec for fast loss detection and changes to min-RTT/4 when reordering is observed. We found that 1 msec accommodates small degrees of reordering (<3 pkts) well on faster links. We use min-RTT instead of SRTT because reordering is more of a path property, while SRTT can be inflated by self-inflicted congestion. The factor of 4 is borrowed from delayed early retransmit and seems to work reasonably well.

Since RACK is still experimental, it is used as a supplemental loss detection mechanism on top of the existing algorithms. It is only effective after fast recovery starts or after a timeout occurs. Fast recovery is still triggered by FACK and/or the dupack threshold rather than by RACK.

We introduce a new sysctl, net.ipv4.tcp_recovery, for future loss recovery experiments. For now, RACK can be disabled by setting it to 0.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
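For orientation before the diff, here is a minimal, self-contained C sketch of the RACK decision rule described above. This is a simplified userspace model, not the kernel code; the struct and helper names are illustrative, and the real implementation is in net/ipv4/tcp_recovery.c below.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative model only: all timestamps are in microseconds. */
struct rack_state {
	uint64_t mstamp;	/* xmit time of the most recently delivered packet */
	bool     reord;		/* has reordering been observed on this flow? */
};

/* Pick the "settling delay": 1 msec by default, min-RTT/4 once
 * reordering has been seen (mirrors the patch's reo_wnd choice).
 */
static uint64_t reo_wnd_us(const struct rack_state *rack, uint64_t min_rtt_us)
{
	uint64_t reo_wnd = 1000;

	if (rack->reord && min_rtt_us != UINT64_MAX && min_rtt_us / 4 > reo_wnd)
		reo_wnd = min_rtt_us / 4;
	return reo_wnd;
}

/* A not-yet-sacked packet is deemed lost if a packet sent at least
 * reo_wnd later has already been delivered.
 */
static bool rack_packet_lost(const struct rack_state *rack,
			     uint64_t pkt_xmit_us, uint64_t min_rtt_us)
{
	return rack->mstamp > pkt_xmit_us &&
	       rack->mstamp - pkt_xmit_us > reo_wnd_us(rack, min_rtt_us);
}

int main(void)
{
	struct rack_state rack = { .mstamp = 105000, .reord = false };

	/* Packet sent at t=100 ms; a packet sent at t=105 ms was already
	 * delivered, so with the 1 msec window the older packet is lost.
	 */
	printf("lost: %d\n", rack_packet_lost(&rack, 100000, 20000));
	return 0;
}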
-rw-r--r--	Documentation/networking/ip-sysctl.txt	9
-rw-r--r--	include/net/tcp.h			9
-rw-r--r--	net/ipv4/sysctl_net_ipv4.c		7
-rw-r--r--	net/ipv4/tcp_input.c			9
-rw-r--r--	net/ipv4/tcp_recovery.c			77
5 files changed, 109 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 502d6a572b4f..85752c81c5ec 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -433,6 +433,15 @@ tcp_orphan_retries - INTEGER
 	you should think about lowering this value, such sockets
 	may consume significant resources. Cf. tcp_max_orphans.
 
+tcp_recovery - INTEGER
+	This value is a bitmap to enable various experimental loss recovery
+	features.
+
+	RACK: 0x1 enables the RACK loss detection for fast detection of lost
+	      retransmissions and tail drops.
+
+	Default: 0x1
+
 tcp_reordering - INTEGER
 	Initial reordering level of packets in a TCP stream.
 	TCP stack can then dynamically adjust flow reordering level
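As a usage note (not part of the patch): net.ipv4.tcp_recovery is exposed at /proc/sys/net/ipv4/tcp_recovery, so RACK can be disabled at runtime by writing 0 there (root privileges required). A minimal C sketch:

#include <stdio.h>

int main(void)
{
	/* Writing 0 clears all recovery bits and disables RACK;
	 * writing 1 (0x1) restores the default.
	 */
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_recovery", "w");

	if (!f) {
		perror("tcp_recovery");
		return 1;
	}
	fputs("0\n", f);
	fclose(f);
	return 0;
}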
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c3a9fe057d3..11e320412216 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -567,6 +567,7 @@ void tcp_resume_early_retransmit(struct sock *sk);
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
 
 /* tcp_timer.c */
 void tcp_init_xmit_timers(struct sock *);
@@ -1752,6 +1753,14 @@ void tcp_init(void);
 
 /* tcp_recovery.c */
 
+/* Flags to enable various loss recovery features. See below */
+extern int sysctl_tcp_recovery;
+
+/* Use TCP RACK to detect (some) tail and retransmit losses */
+#define TCP_RACK_LOST_RETRANS  0x1
+
+extern int tcp_rack_mark_lost(struct sock *sk);
+
 extern void tcp_rack_advance(struct tcp_sock *tp,
 			     const struct skb_mstamp *xmit_time, u8 sacked);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 13ab434c2909..25300c5e283b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
+		.procname	= "tcp_recovery",
+		.data		= &sysctl_tcp_recovery,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "tcp_reordering",
 		.data		= &sysctl_tcp_reordering,
 		.maxlen		= sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ce8370525832..fdd88c3803a6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -881,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 
 	if (metric > 0)
 		tcp_disable_early_retrans(tp);
+	tp->rack.reord = 1;
 }
 
 /* This must be called before lost_out is incremented */
@@ -906,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 	}
 }
 
-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
-					    struct sk_buff *skb)
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	tcp_verify_retransmit_hint(tp, skb);
 
@@ -2806,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 		}
 	}
 
+	/* Use RACK to detect loss */
+	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
+	    tcp_rack_mark_lost(sk))
+		flag |= FLAG_LOST_RETRANS;
+
 	/* E. Process state. */
 	switch (icsk->icsk_ca_state) {
 	case TCP_CA_Recovery:
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 8f66a6584845..5353085fd0b2 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,6 +1,83 @@
 #include <linux/tcp.h>
 #include <net/tcp.h>
 
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+
+/* Marks a packet lost, if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ * dupthresh: 3 OOO packets delivered (packet count)
+ * FACK: sequence delta to highest sacked sequence (sequence space)
+ * RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is it applies to both original and retransmitted
+ * packet and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * The current version is only used after recovery starts but can be
+ * easily extended to detect the first loss.
+ */
+int tcp_rack_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	u32 reo_wnd, prior_retrans = tp->retrans_out;
+
+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+		return 0;
+
+	/* Reset the advanced flag to avoid unnecessary queue scanning */
+	tp->rack.advanced = 0;
+
+	/* To be more reordering resilient, allow min_rtt/4 settling delay
+	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
+	 * RTT because reordering is often a path property and less related
+	 * to queuing or delayed ACKs.
+	 *
+	 * TODO: measure and adapt to the observed reordering delay, and
+	 * use a timer to retransmit like the delayed early retransmit.
+	 */
+	reo_wnd = 1000;
+	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+
+	tcp_for_write_queue(skb, sk) {
+		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+		if (skb == tcp_send_head(sk))
+			break;
+
+		/* Skip ones already (s)acked */
+		if (!after(scb->end_seq, tp->snd_una) ||
+		    scb->sacked & TCPCB_SACKED_ACKED)
+			continue;
+
+		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+
+			if (skb_mstamp_us_delta(&tp->rack.mstamp,
+						&skb->skb_mstamp) <= reo_wnd)
+				continue;
+
+			/* skb is lost if packet sent later is sacked */
+			tcp_skb_mark_lost_uncond_verify(tp, skb);
+			if (scb->sacked & TCPCB_SACKED_RETRANS) {
+				scb->sacked &= ~TCPCB_SACKED_RETRANS;
+				tp->retrans_out -= tcp_skb_pcount(skb);
+				NET_INC_STATS_BH(sock_net(sk),
+						 LINUX_MIB_TCPLOSTRETRANSMIT);
+			}
+		} else if (!(scb->sacked & TCPCB_RETRANS)) {
+			/* Original data are sent sequentially so stop early
+			 * b/c the rest are all sent after rack_sent
+			 */
+			break;
+		}
+	}
+	return prior_retrans - tp->retrans_out;
+}
+
 /* Record the most recently (re)sent time among the (s)acked packets */
 void tcp_rack_advance(struct tcp_sock *tp,
 		      const struct skb_mstamp *xmit_time, u8 sacked)
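For intuition about the settling delay (an illustrative calculation, not part of the patch): without observed reordering, reo_wnd stays at the 1000 us floor. Once tp->rack.reord has been set and, say, the measured minimum RTT is 20 ms, reo_wnd = max(20000 >> 2, 1000) = 5000 us, so a packet is marked lost only if a packet sent at least 5 ms after it has already been delivered.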