aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c78
1 files changed, 73 insertions, 5 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 62ee71efd1ce..eedb25db3947 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
95int sysctl_tcp_rfc1337 __read_mostly; 95int sysctl_tcp_rfc1337 __read_mostly;
96int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 96int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
97int sysctl_tcp_frto __read_mostly = 2; 97int sysctl_tcp_frto __read_mostly = 2;
98int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
98 99
99int sysctl_tcp_thin_dupack __read_mostly; 100int sysctl_tcp_thin_dupack __read_mostly;
100 101
@@ -2915,8 +2916,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2915 tcp_xmit_retransmit_queue(sk); 2916 tcp_xmit_retransmit_queue(sk);
2916} 2917}
2917 2918
2919/* Kathleen Nichols' algorithm for tracking the minimum value of
2920 * a data stream over some fixed time interval. (E.g., the minimum
2921 * RTT over the past five minutes.) It uses constant space and constant
2922 * time per update yet almost always delivers the same minimum as an
2923 * implementation that has to keep all the data in the window.
2924 *
2925 * The algorithm keeps track of the best, 2nd best & 3rd best min
2926 * values, maintaining an invariant that the measurement time of the
2927 * n'th best >= n-1'th best. It also makes sure that the three values
2928 * are widely separated in the time window since that bounds the worse
2929 * case error when that data is monotonically increasing over the window.
2930 *
2931 * Upon getting a new min, we can forget everything earlier because it
2932 * has no value - the new min is <= everything else in the window by
2933 * definition and it's the most recent. So we restart fresh on every new min
2934 * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
2935 * best.
2936 */
2937static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
2938{
2939 const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
2940 struct rtt_meas *m = tcp_sk(sk)->rtt_min;
2941 struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
2942 u32 elapsed;
2943
2944 /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
2945 if (unlikely(rttm.rtt <= m[0].rtt))
2946 m[0] = m[1] = m[2] = rttm;
2947 else if (rttm.rtt <= m[1].rtt)
2948 m[1] = m[2] = rttm;
2949 else if (rttm.rtt <= m[2].rtt)
2950 m[2] = rttm;
2951
2952 elapsed = now - m[0].ts;
2953 if (unlikely(elapsed > wlen)) {
2954 /* Passed entire window without a new min so make 2nd choice
2955 * the new min & 3rd choice the new 2nd. So forth and so on.
2956 */
2957 m[0] = m[1];
2958 m[1] = m[2];
2959 m[2] = rttm;
2960 if (now - m[0].ts > wlen) {
2961 m[0] = m[1];
2962 m[1] = rttm;
2963 if (now - m[0].ts > wlen)
2964 m[0] = rttm;
2965 }
2966 } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
2967 /* Passed a quarter of the window without a new min so
2968 * take 2nd choice from the 2nd quarter of the window.
2969 */
2970 m[2] = m[1] = rttm;
2971 } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
2972 /* Passed half the window without a new min so take the 3rd
2973 * choice from the last half of the window.
2974 */
2975 m[2] = rttm;
2976 }
2977}
2978
2918static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, 2979static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2919 long seq_rtt_us, long sack_rtt_us) 2980 long seq_rtt_us, long sack_rtt_us,
2981 long ca_rtt_us)
2920{ 2982{
2921 const struct tcp_sock *tp = tcp_sk(sk); 2983 const struct tcp_sock *tp = tcp_sk(sk);
2922 2984
@@ -2936,11 +2998,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2936 */ 2998 */
2937 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 2999 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2938 flag & FLAG_ACKED) 3000 flag & FLAG_ACKED)
2939 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); 3001 seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
2940 3002 tp->rx_opt.rcv_tsecr);
2941 if (seq_rtt_us < 0) 3003 if (seq_rtt_us < 0)
2942 return false; 3004 return false;
2943 3005
3006 /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
3007 * always taken together with ACK, SACK, or TS-opts. Any negative
3008 * values will be skipped with the seq_rtt_us < 0 check above.
3009 */
3010 tcp_update_rtt_min(sk, ca_rtt_us);
2944 tcp_rtt_estimator(sk, seq_rtt_us); 3011 tcp_rtt_estimator(sk, seq_rtt_us);
2945 tcp_set_rto(sk); 3012 tcp_set_rto(sk);
2946 3013
@@ -2961,7 +3028,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
2961 rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); 3028 rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
2962 } 3029 }
2963 3030
2964 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L); 3031 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
2965} 3032}
2966 3033
2967 3034
@@ -3175,7 +3242,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3175 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); 3242 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
3176 } 3243 }
3177 3244
3178 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); 3245 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3246 ca_rtt_us);
3179 3247
3180 if (flag & FLAG_ACKED) { 3248 if (flag & FLAG_ACKED) {
3181 tcp_rearm_rto(sk); 3249 tcp_rearm_rto(sk);