diff options
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 8 | ||||
-rw-r--r-- | include/linux/tcp.h | 3 | ||||
-rw-r--r-- | include/net/tcp.h | 7 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 78 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 1 |
7 files changed, 100 insertions, 5 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ebe94f2cab98..502d6a572b4f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -384,6 +384,14 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max | |||
384 | Defaults are calculated at boot time from amount of available | 384 | Defaults are calculated at boot time from amount of available |
385 | memory. | 385 | memory. |
386 | 386 | ||
387 | tcp_min_rtt_wlen - INTEGER | ||
388 | The window length of the windowed min filter to track the minimum RTT. | ||
389 | A shorter window lets a flow more quickly pick up a new (higher) | ||
390 | minimum RTT when it is moved to a longer path (e.g., due to traffic | ||
391 | engineering). A longer window makes the filter more resistant to RTT | ||
392 | inflations such as transient congestion. The unit is seconds. | ||
393 | Default: 300 | ||
394 | |||
387 | tcp_moderate_rcvbuf - BOOLEAN | 395 | tcp_moderate_rcvbuf - BOOLEAN |
388 | If set, TCP performs receive buffer auto-tuning, attempting to | 396 | If set, TCP performs receive buffer auto-tuning, attempting to |
389 | automatically size the buffer (no greater than tcp_rmem[2]) to | 397 | automatically size the buffer (no greater than tcp_rmem[2]) to |
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 86a7edaa6797..90edef5508f9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
@@ -217,6 +217,9 @@ struct tcp_sock { | |||
217 | u32 mdev_max_us; /* maximal mdev for the last rtt period */ | 217 | u32 mdev_max_us; /* maximal mdev for the last rtt period */ |
218 | u32 rttvar_us; /* smoothed mdev_max */ | 218 | u32 rttvar_us; /* smoothed mdev_max */ |
219 | u32 rtt_seq; /* sequence number to update rttvar */ | 219 | u32 rtt_seq; /* sequence number to update rttvar */ |
220 | struct rtt_meas { | ||
221 | u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */ | ||
222 | } rtt_min[3]; | ||
220 | 223 | ||
221 | u32 packets_out; /* Packets which are "in flight" */ | 224 | u32 packets_out; /* Packets which are "in flight" */ |
222 | u32 retrans_out; /* Retransmitted packets out */ | 225 | u32 retrans_out; /* Retransmitted packets out */ |
diff --git a/include/net/tcp.h b/include/net/tcp.h index eed94fc355c1..4a43152229ea 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -279,6 +279,7 @@ extern int sysctl_tcp_limit_output_bytes; | |||
279 | extern int sysctl_tcp_challenge_ack_limit; | 279 | extern int sysctl_tcp_challenge_ack_limit; |
280 | extern unsigned int sysctl_tcp_notsent_lowat; | 280 | extern unsigned int sysctl_tcp_notsent_lowat; |
281 | extern int sysctl_tcp_min_tso_segs; | 281 | extern int sysctl_tcp_min_tso_segs; |
282 | extern int sysctl_tcp_min_rtt_wlen; | ||
282 | extern int sysctl_tcp_autocorking; | 283 | extern int sysctl_tcp_autocorking; |
283 | extern int sysctl_tcp_invalid_ratelimit; | 284 | extern int sysctl_tcp_invalid_ratelimit; |
284 | extern int sysctl_tcp_pacing_ss_ratio; | 285 | extern int sysctl_tcp_pacing_ss_ratio; |
@@ -671,6 +672,12 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) | |||
671 | return dst_metric_locked(dst, RTAX_CC_ALGO); | 672 | return dst_metric_locked(dst, RTAX_CC_ALGO); |
672 | } | 673 | } |
673 | 674 | ||
675 | /* Minimum RTT in usec. ~0 means not available. */ | ||
676 | static inline u32 tcp_min_rtt(const struct tcp_sock *tp) | ||
677 | { | ||
678 | return tp->rtt_min[0].rtt; | ||
679 | } | ||
680 | |||
674 | /* Compute the actual receive window we are currently advertising. | 681 | /* Compute the actual receive window we are currently advertising. |
675 | * Rcv_nxt can be after the window if our peer push more data | 682 | * Rcv_nxt can be after the window if our peer push more data |
676 | * than the offered window. | 683 | * than the offered window. |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 894da3a70aff..13ab434c2909 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -577,6 +577,13 @@ static struct ctl_table ipv4_table[] = { | |||
577 | .proc_handler = proc_dointvec | 577 | .proc_handler = proc_dointvec |
578 | }, | 578 | }, |
579 | { | 579 | { |
580 | .procname = "tcp_min_rtt_wlen", | ||
581 | .data = &sysctl_tcp_min_rtt_wlen, | ||
582 | .maxlen = sizeof(int), | ||
583 | .mode = 0644, | ||
584 | .proc_handler = proc_dointvec | ||
585 | }, | ||
586 | { | ||
580 | .procname = "tcp_low_latency", | 587 | .procname = "tcp_low_latency", |
581 | .data = &sysctl_tcp_low_latency, | 588 | .data = &sysctl_tcp_low_latency, |
582 | .maxlen = sizeof(int), | 589 | .maxlen = sizeof(int), |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ac1bdbb50352..0cfa7c0c1e80 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk) | |||
388 | 388 | ||
389 | icsk->icsk_rto = TCP_TIMEOUT_INIT; | 389 | icsk->icsk_rto = TCP_TIMEOUT_INIT; |
390 | tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); | 390 | tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); |
391 | tp->rtt_min[0].rtt = ~0U; | ||
391 | 392 | ||
392 | /* So many TCP implementations out there (incorrectly) count the | 393 | /* So many TCP implementations out there (incorrectly) count the |
393 | * initial SYN frame in their delayed-ACK and congestion control | 394 | * initial SYN frame in their delayed-ACK and congestion control |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 62ee71efd1ce..eedb25db3947 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly; | |||
95 | int sysctl_tcp_rfc1337 __read_mostly; | 95 | int sysctl_tcp_rfc1337 __read_mostly; |
96 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; | 96 | int sysctl_tcp_max_orphans __read_mostly = NR_FILE; |
97 | int sysctl_tcp_frto __read_mostly = 2; | 97 | int sysctl_tcp_frto __read_mostly = 2; |
98 | int sysctl_tcp_min_rtt_wlen __read_mostly = 300; | ||
98 | 99 | ||
99 | int sysctl_tcp_thin_dupack __read_mostly; | 100 | int sysctl_tcp_thin_dupack __read_mostly; |
100 | 101 | ||
@@ -2915,8 +2916,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, | |||
2915 | tcp_xmit_retransmit_queue(sk); | 2916 | tcp_xmit_retransmit_queue(sk); |
2916 | } | 2917 | } |
2917 | 2918 | ||
2919 | /* Kathleen Nichols' algorithm for tracking the minimum value of | ||
2920 | * a data stream over some fixed time interval. (E.g., the minimum | ||
2921 | * RTT over the past five minutes.) It uses constant space and constant | ||
2922 | * time per update yet almost always delivers the same minimum as an | ||
2923 | * implementation that has to keep all the data in the window. | ||
2924 | * | ||
2925 | * The algorithm keeps track of the best, 2nd best & 3rd best min | ||
2926 | * values, maintaining an invariant that the measurement time of the | ||
2927 | * n'th best >= n-1'th best. It also makes sure that the three values | ||
2928 | * are widely separated in the time window since that bounds the worse | ||
2929 | * case error when that data is monotonically increasing over the window. | ||
2930 | * | ||
2931 | * Upon getting a new min, we can forget everything earlier because it | ||
2932 | * has no value - the new min is <= everything else in the window by | ||
2933 | * definition and it's the most recent. So we restart fresh on every new min | ||
2934 | * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd | ||
2935 | * best. | ||
2936 | */ | ||
2937 | static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) | ||
2938 | { | ||
2939 | const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; | ||
2940 | struct rtt_meas *m = tcp_sk(sk)->rtt_min; | ||
2941 | struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now }; | ||
2942 | u32 elapsed; | ||
2943 | |||
2944 | /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ | ||
2945 | if (unlikely(rttm.rtt <= m[0].rtt)) | ||
2946 | m[0] = m[1] = m[2] = rttm; | ||
2947 | else if (rttm.rtt <= m[1].rtt) | ||
2948 | m[1] = m[2] = rttm; | ||
2949 | else if (rttm.rtt <= m[2].rtt) | ||
2950 | m[2] = rttm; | ||
2951 | |||
2952 | elapsed = now - m[0].ts; | ||
2953 | if (unlikely(elapsed > wlen)) { | ||
2954 | /* Passed entire window without a new min so make 2nd choice | ||
2955 | * the new min & 3rd choice the new 2nd. So forth and so on. | ||
2956 | */ | ||
2957 | m[0] = m[1]; | ||
2958 | m[1] = m[2]; | ||
2959 | m[2] = rttm; | ||
2960 | if (now - m[0].ts > wlen) { | ||
2961 | m[0] = m[1]; | ||
2962 | m[1] = rttm; | ||
2963 | if (now - m[0].ts > wlen) | ||
2964 | m[0] = rttm; | ||
2965 | } | ||
2966 | } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) { | ||
2967 | /* Passed a quarter of the window without a new min so | ||
2968 | * take 2nd choice from the 2nd quarter of the window. | ||
2969 | */ | ||
2970 | m[2] = m[1] = rttm; | ||
2971 | } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) { | ||
2972 | /* Passed half the window without a new min so take the 3rd | ||
2973 | * choice from the last half of the window. | ||
2974 | */ | ||
2975 | m[2] = rttm; | ||
2976 | } | ||
2977 | } | ||
2978 | |||
2918 | static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, | 2979 | static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, |
2919 | long seq_rtt_us, long sack_rtt_us) | 2980 | long seq_rtt_us, long sack_rtt_us, |
2981 | long ca_rtt_us) | ||
2920 | { | 2982 | { |
2921 | const struct tcp_sock *tp = tcp_sk(sk); | 2983 | const struct tcp_sock *tp = tcp_sk(sk); |
2922 | 2984 | ||
@@ -2936,11 +2998,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, | |||
2936 | */ | 2998 | */ |
2937 | if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 2999 | if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
2938 | flag & FLAG_ACKED) | 3000 | flag & FLAG_ACKED) |
2939 | seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); | 3001 | seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp - |
2940 | 3002 | tp->rx_opt.rcv_tsecr); | |
2941 | if (seq_rtt_us < 0) | 3003 | if (seq_rtt_us < 0) |
2942 | return false; | 3004 | return false; |
2943 | 3005 | ||
3006 | /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is | ||
3007 | * always taken together with ACK, SACK, or TS-opts. Any negative | ||
3008 | * values will be skipped with the seq_rtt_us < 0 check above. | ||
3009 | */ | ||
3010 | tcp_update_rtt_min(sk, ca_rtt_us); | ||
2944 | tcp_rtt_estimator(sk, seq_rtt_us); | 3011 | tcp_rtt_estimator(sk, seq_rtt_us); |
2945 | tcp_set_rto(sk); | 3012 | tcp_set_rto(sk); |
2946 | 3013 | ||
@@ -2961,7 +3028,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) | |||
2961 | rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); | 3028 | rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); |
2962 | } | 3029 | } |
2963 | 3030 | ||
2964 | tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L); | 3031 | tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us); |
2965 | } | 3032 | } |
2966 | 3033 | ||
2967 | 3034 | ||
@@ -3175,7 +3242,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | |||
3175 | ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); | 3242 | ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); |
3176 | } | 3243 | } |
3177 | 3244 | ||
3178 | rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); | 3245 | rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, |
3246 | ca_rtt_us); | ||
3179 | 3247 | ||
3180 | if (flag & FLAG_ACKED) { | 3248 | if (flag & FLAG_ACKED) { |
3181 | tcp_rearm_rto(sk); | 3249 | tcp_rearm_rto(sk); |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 41828bdc5d32..b875c288daaa 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -470,6 +470,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, | |||
470 | 470 | ||
471 | newtp->srtt_us = 0; | 471 | newtp->srtt_us = 0; |
472 | newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); | 472 | newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); |
473 | newtp->rtt_min[0].rtt = ~0U; | ||
473 | newicsk->icsk_rto = TCP_TIMEOUT_INIT; | 474 | newicsk->icsk_rto = TCP_TIMEOUT_INIT; |
474 | 475 | ||
475 | newtp->packets_out = 0; | 476 | newtp->packets_out = 0; |