aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYuchung Cheng <ycheng@google.com>2015-10-17 00:57:42 -0400
committerDavid S. Miller <davem@davemloft.net>2015-10-21 10:00:43 -0400
commitf672258391b42a5c7cc2732c9c063e56a85c8dbe (patch)
tree7b94f91a3b04fd478a4ea08eaa52edbc38492a99
parent9e45a3e36b363cc4c79c70f2b4f994e66543a219 (diff)
tcp: track min RTT using windowed min-filter
Kathleen Nichols' algorithm for tracking the minimum RTT of a data stream over some measurement window. It uses constant space and constant time per update, yet it almost always delivers the same minimum as an implementation that has to keep all the data in the window. The measurement window is tunable via the sysctl net.ipv4.tcp_min_rtt_wlen, with a default value of 5 minutes. The algorithm keeps track of the best, 2nd best & 3rd best min values, maintaining an invariant that the measurement time of the n'th best >= n-1'th best. It also makes sure that the three values are widely separated in the time window, since that bounds the worst-case error when the data is monotonically increasing over the window. Upon getting a new min, we can forget everything earlier because it has no value - the new min is less than or equal to everything else in the window by definition and it's the most recent. So we restart fresh on every new min and overwrite the 2nd & 3rd choices. The same property holds for the 2nd & 3rd best. Therefore we have to maintain two invariants to maximize the information in the samples, one on values (1st.v <= 2nd.v <= 3rd.v) and the other on times (now - win <= 1st.t <= 2nd.t <= 3rd.t <= now). These invariants determine the structure of the code. The RTT input to the windowed filter is the minimum RTT measured from ACK or SACK, or as the last resort from TCP timestamps. The accessor tcp_min_rtt() returns the minimum RTT seen in the window. ~0U indicates it is not available. The minimum is 1usec even if the true RTT is below that. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/ip-sysctl.txt8
-rw-r--r--include/linux/tcp.h3
-rw-r--r--include/net/tcp.h7
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp.c1
-rw-r--r--net/ipv4/tcp_input.c78
-rw-r--r--net/ipv4/tcp_minisocks.c1
7 files changed, 100 insertions, 5 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ebe94f2cab98..502d6a572b4f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -384,6 +384,14 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max
384 Defaults are calculated at boot time from amount of available 384 Defaults are calculated at boot time from amount of available
385 memory. 385 memory.
386 386
387tcp_min_rtt_wlen - INTEGER
388 The window length of the windowed min filter to track the minimum RTT.
389 A shorter window lets a flow more quickly pick up a new (higher)
390 minimum RTT when it is moved to a longer path (e.g., due to traffic
391 engineering). A longer window makes the filter more resistant to RTT
392 inflations such as transient congestion. The unit is seconds.
393 Default: 300
394
387tcp_moderate_rcvbuf - BOOLEAN 395tcp_moderate_rcvbuf - BOOLEAN
388 If set, TCP performs receive buffer auto-tuning, attempting to 396 If set, TCP performs receive buffer auto-tuning, attempting to
389 automatically size the buffer (no greater than tcp_rmem[2]) to 397 automatically size the buffer (no greater than tcp_rmem[2]) to
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 86a7edaa6797..90edef5508f9 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -217,6 +217,9 @@ struct tcp_sock {
217 u32 mdev_max_us; /* maximal mdev for the last rtt period */ 217 u32 mdev_max_us; /* maximal mdev for the last rtt period */
218 u32 rttvar_us; /* smoothed mdev_max */ 218 u32 rttvar_us; /* smoothed mdev_max */
219 u32 rtt_seq; /* sequence number to update rttvar */ 219 u32 rtt_seq; /* sequence number to update rttvar */
220 struct rtt_meas {
221 u32 rtt, ts; /* One min-filter sample: RTT in usec and its sampling time in jiffies. */
222 } rtt_min[3]; /* Best, 2nd best and 3rd best windowed-min RTT samples; [0] is the current minimum. */
220 223
221 u32 packets_out; /* Packets which are "in flight" */ 224 u32 packets_out; /* Packets which are "in flight" */
222 u32 retrans_out; /* Retransmitted packets out */ 225 u32 retrans_out; /* Retransmitted packets out */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index eed94fc355c1..4a43152229ea 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -279,6 +279,7 @@ extern int sysctl_tcp_limit_output_bytes;
279extern int sysctl_tcp_challenge_ack_limit; 279extern int sysctl_tcp_challenge_ack_limit;
280extern unsigned int sysctl_tcp_notsent_lowat; 280extern unsigned int sysctl_tcp_notsent_lowat;
281extern int sysctl_tcp_min_tso_segs; 281extern int sysctl_tcp_min_tso_segs;
282extern int sysctl_tcp_min_rtt_wlen;
282extern int sysctl_tcp_autocorking; 283extern int sysctl_tcp_autocorking;
283extern int sysctl_tcp_invalid_ratelimit; 284extern int sysctl_tcp_invalid_ratelimit;
284extern int sysctl_tcp_pacing_ss_ratio; 285extern int sysctl_tcp_pacing_ss_ratio;
@@ -671,6 +672,12 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
671 return dst_metric_locked(dst, RTAX_CC_ALGO); 672 return dst_metric_locked(dst, RTAX_CC_ALGO);
672} 673}
673 674
675/* Return the windowed minimum RTT in usec, i.e. the best sample held in
 * tp->rtt_min[0]. ~0U means no RTT sample is available (the value the
 * socket is initialised with); a recorded sample is never below 1 usec.
 */
676static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
677{
678 return tp->rtt_min[0].rtt;
679}
680
674/* Compute the actual receive window we are currently advertising. 681/* Compute the actual receive window we are currently advertising.
675 * Rcv_nxt can be after the window if our peer push more data 682 * Rcv_nxt can be after the window if our peer push more data
676 * than the offered window. 683 * than the offered window.
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 894da3a70aff..13ab434c2909 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -577,6 +577,13 @@ static struct ctl_table ipv4_table[] = {
577 .proc_handler = proc_dointvec 577 .proc_handler = proc_dointvec
578 }, 578 },
579 { 579 {
580 .procname = "tcp_min_rtt_wlen",
581 .data = &sysctl_tcp_min_rtt_wlen,
582 .maxlen = sizeof(int),
583 .mode = 0644,
584 .proc_handler = proc_dointvec
585 },
586 {
580 .procname = "tcp_low_latency", 587 .procname = "tcp_low_latency",
581 .data = &sysctl_tcp_low_latency, 588 .data = &sysctl_tcp_low_latency,
582 .maxlen = sizeof(int), 589 .maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ac1bdbb50352..0cfa7c0c1e80 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk)
388 388
389 icsk->icsk_rto = TCP_TIMEOUT_INIT; 389 icsk->icsk_rto = TCP_TIMEOUT_INIT;
390 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 390 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
391 tp->rtt_min[0].rtt = ~0U;
391 392
392 /* So many TCP implementations out there (incorrectly) count the 393 /* So many TCP implementations out there (incorrectly) count the
393 * initial SYN frame in their delayed-ACK and congestion control 394 * initial SYN frame in their delayed-ACK and congestion control
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 62ee71efd1ce..eedb25db3947 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
95int sysctl_tcp_rfc1337 __read_mostly; 95int sysctl_tcp_rfc1337 __read_mostly;
96int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 96int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
97int sysctl_tcp_frto __read_mostly = 2; 97int sysctl_tcp_frto __read_mostly = 2;
98int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
98 99
99int sysctl_tcp_thin_dupack __read_mostly; 100int sysctl_tcp_thin_dupack __read_mostly;
100 101
@@ -2915,8 +2916,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2915 tcp_xmit_retransmit_queue(sk); 2916 tcp_xmit_retransmit_queue(sk);
2916} 2917}
2917 2918
2919/* Kathleen Nichols' algorithm for tracking the minimum value of
2920 * a data stream over some fixed time interval. (E.g., the minimum
2921 * RTT over the past five minutes.) It uses constant space and constant
2922 * time per update yet almost always delivers the same minimum as an
2923 * implementation that has to keep all the data in the window.
2924 *
2925 * The algorithm keeps track of the best, 2nd best & 3rd best min
2926 * values, maintaining an invariant that the measurement time of the
2927 * n'th best >= n-1'th best. It also makes sure that the three values
2928 * are widely separated in the time window since that bounds the worst
2929 * case error when that data is monotonically increasing over the window.
2930 *
2931 * Upon getting a new min, we can forget everything earlier because it
2932 * has no value - the new min is <= everything else in the window by
2933 * definition and it's the most recent. So we restart fresh on every new min
2934 * and overwrite the 2nd & 3rd choices. The same property holds for 2nd & 3rd
2935 * best.
 *
 * sk:     socket whose tcp_sock rtt_min[] samples are updated in place.
 * rtt_us: new RTT sample in usec; a value of 0 is recorded as 1.
 *
 * The window length is sysctl_tcp_min_rtt_wlen (seconds) multiplied by HZ
 * and compared against sample ages computed by u32 subtraction of
 * timestamps.
 *
 * NOTE(review): sysctl_tcp_min_rtt_wlen is an int and is multiplied by HZ
 * before being stored in a u32; the sysctl entry uses plain proc_dointvec,
 * so negative or very large settings look unchecked here - confirm whether
 * a range limit is needed.
2936 */
2937static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
2938{
2939 const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
2940 struct rtt_meas *m = tcp_sk(sk)->rtt_min;
2941 struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now }; /* 0 usec is stored as 1 */
2942 u32 elapsed;
2943
2944 /* Check if the new measurement updates the 1st, 2nd, or 3rd choices.
 * A new best sample also overwrites every lower-ranked slot.
 */
2945 if (unlikely(rttm.rtt <= m[0].rtt))
2946 m[0] = m[1] = m[2] = rttm;
2947 else if (rttm.rtt <= m[1].rtt)
2948 m[1] = m[2] = rttm;
2949 else if (rttm.rtt <= m[2].rtt)
2950 m[2] = rttm;
2951
2952 elapsed = now - m[0].ts; /* age of the current minimum */
2953 if (unlikely(elapsed > wlen)) {
2954 /* Passed entire window without a new min so make 2nd choice
2955 * the new min & 3rd choice the new 2nd. So forth and so on.
 * The promoted sample is re-checked against the window up to
 * two more times; if every stored sample is too old, the new
 * measurement becomes the minimum.
2956 */
2957 m[0] = m[1];
2958 m[1] = m[2];
2959 m[2] = rttm;
2960 if (now - m[0].ts > wlen) {
2961 m[0] = m[1];
2962 m[1] = rttm;
2963 if (now - m[0].ts > wlen)
2964 m[0] = rttm;
2965 }
2966 } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
2967 /* Passed a quarter of the window without a new min so
2968 * take 2nd choice from the 2nd quarter of the window.
 * Equal timestamps mean the 2nd choice is still the copy
 * made when the minimum was set.
2969 */
2970 m[2] = m[1] = rttm;
2971 } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
2972 /* Passed half the window without a new min so take the 3rd
2973 * choice from the last half of the window.
2974 */
2975 m[2] = rttm;
2976 }
2977}
2978
2918static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, 2979static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2919 long seq_rtt_us, long sack_rtt_us) 2980 long seq_rtt_us, long sack_rtt_us,
2981 long ca_rtt_us)
2920{ 2982{
2921 const struct tcp_sock *tp = tcp_sk(sk); 2983 const struct tcp_sock *tp = tcp_sk(sk);
2922 2984
@@ -2936,11 +2998,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2936 */ 2998 */
2937 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 2999 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2938 flag & FLAG_ACKED) 3000 flag & FLAG_ACKED)
2939 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); 3001 seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
2940 3002 tp->rx_opt.rcv_tsecr);
2941 if (seq_rtt_us < 0) 3003 if (seq_rtt_us < 0)
2942 return false; 3004 return false;
2943 3005
3006 /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
3007 * always taken together with ACK, SACK, or TS-opts. Any negative
3008 * values will be skipped with the seq_rtt_us < 0 check above.
3009 */
3010 tcp_update_rtt_min(sk, ca_rtt_us);
2944 tcp_rtt_estimator(sk, seq_rtt_us); 3011 tcp_rtt_estimator(sk, seq_rtt_us);
2945 tcp_set_rto(sk); 3012 tcp_set_rto(sk);
2946 3013
@@ -2961,7 +3028,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
2961 rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); 3028 rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
2962 } 3029 }
2963 3030
2964 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L); 3031 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
2965} 3032}
2966 3033
2967 3034
@@ -3175,7 +3242,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3175 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); 3242 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
3176 } 3243 }
3177 3244
3178 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); 3245 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3246 ca_rtt_us);
3179 3247
3180 if (flag & FLAG_ACKED) { 3248 if (flag & FLAG_ACKED) {
3181 tcp_rearm_rto(sk); 3249 tcp_rearm_rto(sk);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 41828bdc5d32..b875c288daaa 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -470,6 +470,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
470 470
471 newtp->srtt_us = 0; 471 newtp->srtt_us = 0;
472 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 472 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
473 newtp->rtt_min[0].rtt = ~0U;
473 newicsk->icsk_rto = TCP_TIMEOUT_INIT; 474 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
474 475
475 newtp->packets_out = 0; 476 newtp->packets_out = 0;