aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
authorYuchung Cheng <ycheng@google.com>2015-10-17 00:57:42 -0400
committerDavid S. Miller <davem@davemloft.net>2015-10-21 10:00:43 -0400
commitf672258391b42a5c7cc2732c9c063e56a85c8dbe (patch)
tree7b94f91a3b04fd478a4ea08eaa52edbc38492a99 /net/ipv4/tcp_input.c
parent9e45a3e36b363cc4c79c70f2b4f994e66543a219 (diff)
tcp: track min RTT using windowed min-filter
Kathleen Nichols' algorithm for tracking the minimum RTT of a data stream over some measurement window. It uses constant space and constant time per update. Yet it almost always delivers the same minimum as an implementation that has to keep all the data in the window. The measurement window is tunable via sysctl.net.ipv4.tcp_min_rtt_wlen with a default value of 5 minutes. The algorithm keeps track of the best, 2nd best & 3rd best min values, maintaining an invariant that the measurement time of the n'th best >= n-1'th best. It also makes sure that the three values are widely separated in the time window since that bounds the worse case error when that data is monotonically increasing over the window. Upon getting a new min, we can forget everything earlier because it has no value - the new min is less than everything else in the window by definition and it's the most recent. So we restart fresh on every new min and overwrites the 2nd & 3rd choices. The same property holds for the 2nd & 3rd best. Therefore we have to maintain two invariants to maximize the information in the samples, one on values (1st.v <= 2nd.v <= 3rd.v) and the other on times (now-win <=1st.t <= 2nd.t <= 3rd.t <= now). These invariants determine the structure of the code The RTT input to the windowed filter is the minimum RTT measured from ACK or SACK, or as the last resort from TCP timestamps. The accessor tcp_min_rtt() returns the minimum RTT seen in the window. ~0U indicates it is not available. The minimum is 1usec even if the true RTT is below that. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c78
1 files changed, 73 insertions, 5 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 62ee71efd1ce..eedb25db3947 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
95int sysctl_tcp_rfc1337 __read_mostly; 95int sysctl_tcp_rfc1337 __read_mostly;
96int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 96int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
97int sysctl_tcp_frto __read_mostly = 2; 97int sysctl_tcp_frto __read_mostly = 2;
98int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
98 99
99int sysctl_tcp_thin_dupack __read_mostly; 100int sysctl_tcp_thin_dupack __read_mostly;
100 101
@@ -2915,8 +2916,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2915 tcp_xmit_retransmit_queue(sk); 2916 tcp_xmit_retransmit_queue(sk);
2916} 2917}
2917 2918
2919/* Kathleen Nichols' algorithm for tracking the minimum value of
2920 * a data stream over some fixed time interval. (E.g., the minimum
2921 * RTT over the past five minutes.) It uses constant space and constant
2922 * time per update yet almost always delivers the same minimum as an
2923 * implementation that has to keep all the data in the window.
2924 *
2925 * The algorithm keeps track of the best, 2nd best & 3rd best min
2926 * values, maintaining an invariant that the measurement time of the
2927 * n'th best >= n-1'th best. It also makes sure that the three values
2928 * are widely separated in the time window since that bounds the worse
2929 * case error when that data is monotonically increasing over the window.
2930 *
2931 * Upon getting a new min, we can forget everything earlier because it
2932 * has no value - the new min is <= everything else in the window by
2933 * definition and it's the most recent. So we restart fresh on every new min
2934 * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
2935 * best.
2936 */
2937static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
2938{
2939 const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
2940 struct rtt_meas *m = tcp_sk(sk)->rtt_min;
2941 struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
2942 u32 elapsed;
2943
2944 /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
2945 if (unlikely(rttm.rtt <= m[0].rtt))
2946 m[0] = m[1] = m[2] = rttm;
2947 else if (rttm.rtt <= m[1].rtt)
2948 m[1] = m[2] = rttm;
2949 else if (rttm.rtt <= m[2].rtt)
2950 m[2] = rttm;
2951
2952 elapsed = now - m[0].ts;
2953 if (unlikely(elapsed > wlen)) {
2954 /* Passed entire window without a new min so make 2nd choice
2955 * the new min & 3rd choice the new 2nd. So forth and so on.
2956 */
2957 m[0] = m[1];
2958 m[1] = m[2];
2959 m[2] = rttm;
2960 if (now - m[0].ts > wlen) {
2961 m[0] = m[1];
2962 m[1] = rttm;
2963 if (now - m[0].ts > wlen)
2964 m[0] = rttm;
2965 }
2966 } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
2967 /* Passed a quarter of the window without a new min so
2968 * take 2nd choice from the 2nd quarter of the window.
2969 */
2970 m[2] = m[1] = rttm;
2971 } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
2972 /* Passed half the window without a new min so take the 3rd
2973 * choice from the last half of the window.
2974 */
2975 m[2] = rttm;
2976 }
2977}
2978
2918static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, 2979static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2919 long seq_rtt_us, long sack_rtt_us) 2980 long seq_rtt_us, long sack_rtt_us,
2981 long ca_rtt_us)
2920{ 2982{
2921 const struct tcp_sock *tp = tcp_sk(sk); 2983 const struct tcp_sock *tp = tcp_sk(sk);
2922 2984
@@ -2936,11 +2998,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2936 */ 2998 */
2937 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 2999 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2938 flag & FLAG_ACKED) 3000 flag & FLAG_ACKED)
2939 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); 3001 seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
2940 3002 tp->rx_opt.rcv_tsecr);
2941 if (seq_rtt_us < 0) 3003 if (seq_rtt_us < 0)
2942 return false; 3004 return false;
2943 3005
3006 /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
3007 * always taken together with ACK, SACK, or TS-opts. Any negative
3008 * values will be skipped with the seq_rtt_us < 0 check above.
3009 */
3010 tcp_update_rtt_min(sk, ca_rtt_us);
2944 tcp_rtt_estimator(sk, seq_rtt_us); 3011 tcp_rtt_estimator(sk, seq_rtt_us);
2945 tcp_set_rto(sk); 3012 tcp_set_rto(sk);
2946 3013
@@ -2961,7 +3028,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
2961 rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); 3028 rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
2962 } 3029 }
2963 3030
2964 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L); 3031 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
2965} 3032}
2966 3033
2967 3034
@@ -3175,7 +3242,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3175 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); 3242 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
3176 } 3243 }
3177 3244
3178 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); 3245 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3246 ca_rtt_us);
3179 3247
3180 if (flag & FLAG_ACKED) { 3248 if (flag & FLAG_ACKED) {
3181 tcp_rearm_rto(sk); 3249 tcp_rearm_rto(sk);