aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2014-02-06 18:57:10 -0500
committerDavid S. Miller <davem@davemloft.net>2014-02-07 00:28:06 -0500
commit4a5ab4e224288403b0b4b6b8c4d339323150c312 (patch)
treeafee288be08ddcb19baf988e022cc1394cf9ce83
parentdbe173079ab58a444e12dbebe96f5aec1e0bed1a (diff)
tcp: remove 1ms offset in srtt computation
TCP pacing depends on an accurate srtt estimation. Current srtt estimation is using jiffie resolution, and has an artificial offset of at least 1 ms, which can produce slowdowns when FQ/pacing is used, especially in DC world, where typical rtt is below 1 ms. We are planning a switch to usec resolution for linux-3.15, but in the meantime, this patch removes the 1 ms offset. All we need is to have tp->srtt minimal value of 1 to differentiate the case of srtt being initialized or not, not 8. The problematic behavior was observed on a 40Gbit testbed, where 32 concurrent netperf were reaching 12Gbps of aggregate speed, instead of line speed. This patch also has the effect of reporting more accurate srtt and send rates to iproute2 ss command as in : $ ss -i dst cca2 Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port tcp ESTAB 0 0 10.244.129.1:56984 10.244.129.2:12865 cubic wscale:6,6 rto:200 rtt:0.25/0.25 ato:40 mss:1448 cwnd:10 send 463.4Mbps rcv_rtt:1 rcv_space:29200 tcp ESTAB 0 390960 10.244.129.1:60247 10.244.129.2:50204 cubic wscale:6,6 rto:200 rtt:0.875/0.75 mss:1448 cwnd:73 ssthresh:51 send 966.4Mbps unacked:73 retrans:0/121 rcv_space:29200 Reported-by: Vytautas Valancius <valas@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Yuchung Cheng <ycheng@google.com> Cc: Neal Cardwell <ncardwell@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--net/ipv4/tcp_input.c18
-rw-r--r--net/ipv4/tcp_output.c2
2 files changed, 11 insertions, 9 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 65cf90e063d5..227cba79fa6b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -671,6 +671,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
671{ 671{
672 struct tcp_sock *tp = tcp_sk(sk); 672 struct tcp_sock *tp = tcp_sk(sk);
673 long m = mrtt; /* RTT */ 673 long m = mrtt; /* RTT */
674 u32 srtt = tp->srtt;
674 675
675 /* The following amusing code comes from Jacobson's 676 /* The following amusing code comes from Jacobson's
676 * article in SIGCOMM '88. Note that rtt and mdev 677 * article in SIGCOMM '88. Note that rtt and mdev
@@ -688,11 +689,9 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
688 * does not matter how to _calculate_ it. Seems, it was trap 689 * does not matter how to _calculate_ it. Seems, it was trap
689 * that VJ failed to avoid. 8) 690 * that VJ failed to avoid. 8)
690 */ 691 */
691 if (m == 0) 692 if (srtt != 0) {
692 m = 1; 693 m -= (srtt >> 3); /* m is now error in rtt est */
693 if (tp->srtt != 0) { 694 srtt += m; /* rtt = 7/8 rtt + 1/8 new */
694 m -= (tp->srtt >> 3); /* m is now error in rtt est */
695 tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
696 if (m < 0) { 695 if (m < 0) {
697 m = -m; /* m is now abs(error) */ 696 m = -m; /* m is now abs(error) */
698 m -= (tp->mdev >> 2); /* similar update on mdev */ 697 m -= (tp->mdev >> 2); /* similar update on mdev */
@@ -723,11 +722,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
723 } 722 }
724 } else { 723 } else {
725 /* no previous measure. */ 724 /* no previous measure. */
726 tp->srtt = m << 3; /* take the measured time to be rtt */ 725 srtt = m << 3; /* take the measured time to be rtt */
727 tp->mdev = m << 1; /* make sure rto = 3*rtt */ 726 tp->mdev = m << 1; /* make sure rto = 3*rtt */
728 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 727 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
729 tp->rtt_seq = tp->snd_nxt; 728 tp->rtt_seq = tp->snd_nxt;
730 } 729 }
730 tp->srtt = max(1U, srtt);
731} 731}
732 732
733/* Set the sk_pacing_rate to allow proper sizing of TSO packets. 733/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
@@ -746,8 +746,10 @@ static void tcp_update_pacing_rate(struct sock *sk)
746 746
747 rate *= max(tp->snd_cwnd, tp->packets_out); 747 rate *= max(tp->snd_cwnd, tp->packets_out);
748 748
749 /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), 749 /* Correction for small srtt and scheduling constraints.
750 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) 750 * For small rtt, consider noise is too high, and use
751 * the minimal value (srtt = 1 -> 125 us for HZ=1000)
752 *
751 * We probably need usec resolution in the future. 753 * We probably need usec resolution in the future.
752 * Note: This also takes care of possible srtt=0 case, 754 * Note: This also takes care of possible srtt=0 case,
753 * when tcp_rtt_estimator() was not yet called. 755 * when tcp_rtt_estimator() was not yet called.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 03d26b85eab8..10435b3b9d0f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1977,7 +1977,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
1977 /* Schedule a loss probe in 2*RTT for SACK capable connections 1977 /* Schedule a loss probe in 2*RTT for SACK capable connections
1978 * in Open state, that are either limited by cwnd or application. 1978 * in Open state, that are either limited by cwnd or application.
1979 */ 1979 */
1980 if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || 1980 if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out ||
1981 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 1981 !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
1982 return false; 1982 return false;
1983 1983