tcp: do not use cached RTT for RTT estimation

RTT cached in the TCP metrics is valuable for the initial timeout because the SYN RTT usually does not account for serialization delays on low-BW paths. However, using it to seed the RTT estimator may be disruptive because other components (e.g., pacing) require the smoothed RTT to be obtained from the actual connection. The solution is to use the higher cached RTT to set the first RTO conservatively, like tcp_rtt_estimator(), but avoid seeding the other RTT estimator variables such as srtt. It is also a good idea to keep the RTO conservative to obtain the first RTT sample, and the performance is ensured by TCP loss probe if the SYN RTT is available. To keep the seeding formula consistent across SYN RTT and cached RTT, the rttvar is twice the cached RTT instead of the cached RTTVAR value. The reason is that the cached variation may be too small (near min RTO), which defeats the purpose of being conservative on the first RTO. However, the metrics still keep the RTT variations as they might be useful for user applications (through ip). Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Tested-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Yuchung Cheng <ycheng@google.com> 2013-08-30 11:35:53 -0400
committer: David S. Miller <davem@davemloft.net> 2013-08-30 15:14:38 -0400
commit: 1b7fdd2ab5852717a4fc7d79847759c67065d7e9 (patch)
tree: 6ddfcff2fb0595cb28b72a72b7c0809395c3082a /net/ipv4/tcp_metrics.c
parent: 08f89b981b0e6b46058dae2957241d4099129521 (diff)
1 files changed, 11 insertions, 33 deletions
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index f6a005c485a9..273ed735cca2 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -443,7 +443,7 @@ void tcp_init_metrics(struct sock *sk)
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_metrics_block *tm;
-        u32 val;
+        u32 val, crtt = 0; /* cached RTT scaled by 8 */
        if (dst == NULL)
                goto reset;
@@ -478,40 +478,18 @@ void tcp_init_metrics(struct sock *sk)
                tp->reordering = val;
        }
-        val = tcp_metric_get(tm, TCP_METRIC_RTT);
+        crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
-        if (val == 0 || tp->srtt == 0) {
-                rcu_read_unlock();
-                goto reset;
-        }
-        /* Initial rtt is determined from SYN,SYN-ACK.
-         * The segment is small and rtt may appear much
-         * less than real one. Use per-dst memory
-         * to make it more realistic.
-         *
-         * A bit of theory. RTT is time passed after "normal" sized packet
-         * is sent until it is ACKed. In normal circumstances sending small
-         * packets force peer to delay ACKs and calculation is correct too.
-         * The algorithm is adaptive and, provided we follow specs, it
-         * NEVER underestimate RTT. BUT! If peer tries to make some clever
-         * tricks sort of "quick acks" for time long enough to decrease RTT
-         * to low value, and then abruptly stops to do it and starts to delay
-         * ACKs, wait for troubles.
-         */
-        val = msecs_to_jiffies(val);
-        if (val > tp->srtt) {
-                tp->srtt = val;
-                tp->rtt_seq = tp->snd_nxt;
-        }
-        val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
-        if (val > tp->mdev) {
-                tp->mdev = val;
-                tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
-        }
        rcu_read_unlock();
-        tcp_set_rto(sk);
 reset:
-        if (tp->srtt == 0) {
+        if (crtt > tp->srtt) {
+                /* Initial RTT (tp->srtt) from SYN usually don't measure
+                 * serialization delay on low BW links well so RTO may be
+                 * under-estimated. Stay conservative and seed RTO with
+                 * the RTTs from past data exchanges, using the same seeding
+                 * formula in tcp_rtt_estimator().
+                 */
+                inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk));
+        } else if (tp->srtt == 0) {
                /* RFC6298: 5.7 We've failed to get a valid RTT sample from
                 * 3WHS. This is most likely due to retransmission,
                 * including spurious one. Reset the RTO back to 3secs
author	Yuchung Cheng <ycheng@google.com>	2013-08-30 11:35:53 -0400
committer	David S. Miller <davem@davemloft.net>	2013-08-30 15:14:38 -0400
commit	1b7fdd2ab5852717a4fc7d79847759c67065d7e9 (patch)
tree	6ddfcff2fb0595cb28b72a72b7c0809395c3082a /net/ipv4/tcp_metrics.c
parent	08f89b981b0e6b46058dae2957241d4099129521 (diff)

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index f6a005c485a9..273ed735cca2 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c
@@ -443,7 +443,7 @@ void tcp_init_metrics(struct sock *sk)
443	struct dst_entry *dst = __sk_dst_get(sk);	443	struct dst_entry *dst = __sk_dst_get(sk);
444	struct tcp_sock *tp = tcp_sk(sk);	444	struct tcp_sock *tp = tcp_sk(sk);
445	struct tcp_metrics_block *tm;	445	struct tcp_metrics_block *tm;
446	u32 val;	446	u32 val, crtt = 0; /* cached RTT scaled by 8 */
447		447
448	if (dst == NULL)	448	if (dst == NULL)
449	goto reset;	449	goto reset;
@@ -478,40 +478,18 @@ void tcp_init_metrics(struct sock *sk)
478	tp->reordering = val;	478	tp->reordering = val;
479	}	479	}
480		480
481	val = tcp_metric_get(tm, TCP_METRIC_RTT);	481	crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
482	if (val == 0 \|\| tp->srtt == 0) {
483	rcu_read_unlock();
484	goto reset;
485	}
486	/* Initial rtt is determined from SYN,SYN-ACK.
487	* The segment is small and rtt may appear much
488	* less than real one. Use per-dst memory
489	* to make it more realistic.
490	*
491	* A bit of theory. RTT is time passed after "normal" sized packet
492	* is sent until it is ACKed. In normal circumstances sending small
493	* packets force peer to delay ACKs and calculation is correct too.
494	* The algorithm is adaptive and, provided we follow specs, it
495	* NEVER underestimate RTT. BUT! If peer tries to make some clever
496	* tricks sort of "quick acks" for time long enough to decrease RTT
497	* to low value, and then abruptly stops to do it and starts to delay
498	* ACKs, wait for troubles.
499	*/
500	val = msecs_to_jiffies(val);
501	if (val > tp->srtt) {
502	tp->srtt = val;
503	tp->rtt_seq = tp->snd_nxt;
504	}
505	val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
506	if (val > tp->mdev) {
507	tp->mdev = val;
508	tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
509	}
510	rcu_read_unlock();	482	rcu_read_unlock();
511
512	tcp_set_rto(sk);
513	reset:	483	reset:
514	if (tp->srtt == 0) {	484	if (crtt > tp->srtt) {
		485	/* Initial RTT (tp->srtt) from SYN usually don't measure
		486	* serialization delay on low BW links well so RTO may be
		487	* under-estimated. Stay conservative and seed RTO with
		488	* the RTTs from past data exchanges, using the same seeding
		489	* formula in tcp_rtt_estimator().
		490	*/
		491	inet_csk(sk)->icsk_rto = crtt + max(crtt >> 2, tcp_rto_min(sk));
		492	} else if (tp->srtt == 0) {
515	/* RFC6298: 5.7 We've failed to get a valid RTT sample from	493	/* RFC6298: 5.7 We've failed to get a valid RTT sample from
516	* 3WHS. This is most likely due to retransmission,	494	* 3WHS. This is most likely due to retransmission,
517	* including spurious one. Reset the RTO back to 3secs	495	* including spurious one. Reset the RTO back to 3secs