aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.txt8
-rw-r--r--include/linux/tcp.h3
-rw-r--r--include/net/tcp.h7
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp.c1
-rw-r--r--net/ipv4/tcp_input.c78
-rw-r--r--net/ipv4/tcp_minisocks.c1
7 files changed, 100 insertions, 5 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ebe94f2cab98..502d6a572b4f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -384,6 +384,14 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max
384 Defaults are calculated at boot time from amount of available 384 Defaults are calculated at boot time from amount of available
385 memory. 385 memory.
386 386
387tcp_min_rtt_wlen - INTEGER
388 The window length of the windowed min filter to track the minimum RTT.
389 A shorter window lets a flow more quickly pick up a new (higher)
390 minimum RTT when it is moved to a longer path (e.g., due to traffic
391 engineering). A longer window makes the filter more resistant to RTT
392 inflations such as those from transient congestion. The unit is seconds.
393 Default: 300
394
387tcp_moderate_rcvbuf - BOOLEAN 395tcp_moderate_rcvbuf - BOOLEAN
388 If set, TCP performs receive buffer auto-tuning, attempting to 396 If set, TCP performs receive buffer auto-tuning, attempting to
389 automatically size the buffer (no greater than tcp_rmem[2]) to 397 automatically size the buffer (no greater than tcp_rmem[2]) to
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 86a7edaa6797..90edef5508f9 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -217,6 +217,9 @@ struct tcp_sock {
217 u32 mdev_max_us; /* maximal mdev for the last rtt period */ 217 u32 mdev_max_us; /* maximal mdev for the last rtt period */
218 u32 rttvar_us; /* smoothed mdev_max */ 218 u32 rttvar_us; /* smoothed mdev_max */
219 u32 rtt_seq; /* sequence number to update rttvar */ 219 u32 rtt_seq; /* sequence number to update rttvar */
220 struct rtt_meas {
221 u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */
222 } rtt_min[3];
220 223
221 u32 packets_out; /* Packets which are "in flight" */ 224 u32 packets_out; /* Packets which are "in flight" */
222 u32 retrans_out; /* Retransmitted packets out */ 225 u32 retrans_out; /* Retransmitted packets out */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index eed94fc355c1..4a43152229ea 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -279,6 +279,7 @@ extern int sysctl_tcp_limit_output_bytes;
279extern int sysctl_tcp_challenge_ack_limit; 279extern int sysctl_tcp_challenge_ack_limit;
280extern unsigned int sysctl_tcp_notsent_lowat; 280extern unsigned int sysctl_tcp_notsent_lowat;
281extern int sysctl_tcp_min_tso_segs; 281extern int sysctl_tcp_min_tso_segs;
282extern int sysctl_tcp_min_rtt_wlen;
282extern int sysctl_tcp_autocorking; 283extern int sysctl_tcp_autocorking;
283extern int sysctl_tcp_invalid_ratelimit; 284extern int sysctl_tcp_invalid_ratelimit;
284extern int sysctl_tcp_pacing_ss_ratio; 285extern int sysctl_tcp_pacing_ss_ratio;
@@ -671,6 +672,12 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
671 return dst_metric_locked(dst, RTAX_CC_ALGO); 672 return dst_metric_locked(dst, RTAX_CC_ALGO);
672} 673}
673 674
675/* Minimum RTT in usec. ~0 means not available. */
676static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
677{
678 return tp->rtt_min[0].rtt;
679}
680
674/* Compute the actual receive window we are currently advertising. 681/* Compute the actual receive window we are currently advertising.
675 * Rcv_nxt can be after the window if our peer push more data 682 * Rcv_nxt can be after the window if our peer push more data
676 * than the offered window. 683 * than the offered window.
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 894da3a70aff..13ab434c2909 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -577,6 +577,13 @@ static struct ctl_table ipv4_table[] = {
577 .proc_handler = proc_dointvec 577 .proc_handler = proc_dointvec
578 }, 578 },
579 { 579 {
580 .procname = "tcp_min_rtt_wlen",
581 .data = &sysctl_tcp_min_rtt_wlen,
582 .maxlen = sizeof(int),
583 .mode = 0644,
584 .proc_handler = proc_dointvec
585 },
586 {
580 .procname = "tcp_low_latency", 587 .procname = "tcp_low_latency",
581 .data = &sysctl_tcp_low_latency, 588 .data = &sysctl_tcp_low_latency,
582 .maxlen = sizeof(int), 589 .maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ac1bdbb50352..0cfa7c0c1e80 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk)
388 388
389 icsk->icsk_rto = TCP_TIMEOUT_INIT; 389 icsk->icsk_rto = TCP_TIMEOUT_INIT;
390 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 390 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
391 tp->rtt_min[0].rtt = ~0U;
391 392
392 /* So many TCP implementations out there (incorrectly) count the 393 /* So many TCP implementations out there (incorrectly) count the
393 * initial SYN frame in their delayed-ACK and congestion control 394 * initial SYN frame in their delayed-ACK and congestion control
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 62ee71efd1ce..eedb25db3947 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
95int sysctl_tcp_rfc1337 __read_mostly; 95int sysctl_tcp_rfc1337 __read_mostly;
96int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 96int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
97int sysctl_tcp_frto __read_mostly = 2; 97int sysctl_tcp_frto __read_mostly = 2;
98int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
98 99
99int sysctl_tcp_thin_dupack __read_mostly; 100int sysctl_tcp_thin_dupack __read_mostly;
100 101
@@ -2915,8 +2916,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
2915 tcp_xmit_retransmit_queue(sk); 2916 tcp_xmit_retransmit_queue(sk);
2916} 2917}
2917 2918
2919/* Kathleen Nichols' algorithm for tracking the minimum value of
2920 * a data stream over some fixed time interval. (E.g., the minimum
2921 * RTT over the past five minutes.) It uses constant space and constant
2922 * time per update yet almost always delivers the same minimum as an
2923 * implementation that has to keep all the data in the window.
2924 *
2925 * The algorithm keeps track of the best, 2nd best & 3rd best min
2926 * values, maintaining an invariant that the measurement time of the
2927 * n'th best >= n-1'th best. It also makes sure that the three values
2928 * are widely separated in the time window since that bounds the
2929 * worst-case error when that data is monotonically increasing over the window.
2930 *
2931 * Upon getting a new min, we can forget everything earlier because it
2932 * has no value - the new min is <= everything else in the window by
2933 * definition and it's the most recent. So we restart fresh on every new min
2934 * and overwrite the 2nd & 3rd choices. The same property holds for 2nd & 3rd
2935 * best.
2936 */
2937static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
2938{
2939 const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
2940 struct rtt_meas *m = tcp_sk(sk)->rtt_min;
2941 struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
2942 u32 elapsed;
2943
2944 /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
2945 if (unlikely(rttm.rtt <= m[0].rtt))
2946 m[0] = m[1] = m[2] = rttm;
2947 else if (rttm.rtt <= m[1].rtt)
2948 m[1] = m[2] = rttm;
2949 else if (rttm.rtt <= m[2].rtt)
2950 m[2] = rttm;
2951
2952 elapsed = now - m[0].ts;
2953 if (unlikely(elapsed > wlen)) {
2954 /* Passed entire window without a new min so make 2nd choice
2955 * the new min & 3rd choice the new 2nd, and so on.
2956 */
2957 m[0] = m[1];
2958 m[1] = m[2];
2959 m[2] = rttm;
2960 if (now - m[0].ts > wlen) {
2961 m[0] = m[1];
2962 m[1] = rttm;
2963 if (now - m[0].ts > wlen)
2964 m[0] = rttm;
2965 }
2966 } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
2967 /* Passed a quarter of the window without a new min so
2968 * take 2nd choice from the 2nd quarter of the window.
2969 */
2970 m[2] = m[1] = rttm;
2971 } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
2972 /* Passed half the window without a new min so take the 3rd
2973 * choice from the last half of the window.
2974 */
2975 m[2] = rttm;
2976 }
2977}
2978
2918static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, 2979static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2919 long seq_rtt_us, long sack_rtt_us) 2980 long seq_rtt_us, long sack_rtt_us,
2981 long ca_rtt_us)
2920{ 2982{
2921 const struct tcp_sock *tp = tcp_sk(sk); 2983 const struct tcp_sock *tp = tcp_sk(sk);
2922 2984
@@ -2936,11 +2998,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2936 */ 2998 */
2937 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 2999 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2938 flag & FLAG_ACKED) 3000 flag & FLAG_ACKED)
2939 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); 3001 seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
2940 3002 tp->rx_opt.rcv_tsecr);
2941 if (seq_rtt_us < 0) 3003 if (seq_rtt_us < 0)
2942 return false; 3004 return false;
2943 3005
3006 /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
3007 * always taken together with ACK, SACK, or TS-opts. Any negative
3008 * values will be skipped with the seq_rtt_us < 0 check above.
3009 */
3010 tcp_update_rtt_min(sk, ca_rtt_us);
2944 tcp_rtt_estimator(sk, seq_rtt_us); 3011 tcp_rtt_estimator(sk, seq_rtt_us);
2945 tcp_set_rto(sk); 3012 tcp_set_rto(sk);
2946 3013
@@ -2961,7 +3028,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
2961 rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack); 3028 rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
2962 } 3029 }
2963 3030
2964 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L); 3031 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
2965} 3032}
2966 3033
2967 3034
@@ -3175,7 +3242,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3175 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); 3242 ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
3176 } 3243 }
3177 3244
3178 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); 3245 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3246 ca_rtt_us);
3179 3247
3180 if (flag & FLAG_ACKED) { 3248 if (flag & FLAG_ACKED) {
3181 tcp_rearm_rto(sk); 3249 tcp_rearm_rto(sk);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 41828bdc5d32..b875c288daaa 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -470,6 +470,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
470 470
471 newtp->srtt_us = 0; 471 newtp->srtt_us = 0;
472 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); 472 newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
473 newtp->rtt_min[0].rtt = ~0U;
473 newicsk->icsk_rto = TCP_TIMEOUT_INIT; 474 newicsk->icsk_rto = TCP_TIMEOUT_INIT;
474 475
475 newtp->packets_out = 0; 476 newtp->packets_out = 0;