author		Eric Dumazet <edumazet@google.com>	2013-08-27 08:46:32 -0400
committer	David S. Miller <davem@davemloft.net>	2013-08-29 15:50:06 -0400
commit		95bd09eb27507691520d39ee1044d6ad831c1168 (patch)
tree		e05045cc6418ce08aa87d5f8c17366a7fa672f3c /net/ipv4/tcp_input.c
parent		b800c3b966bcf004bd8592293a49ed5cb7ea67a9 (diff)
tcp: TSO packets automatic sizing
After hearing many people over past years complaining about TSO being bursty or even buggy, we are proud to present automatic sizing of TSO packets.

One part of the problem is that tcp_tso_should_defer() uses a heuristic relying on upcoming ACKs instead of a timer, but more generally, having big TSO packets makes little sense for low rates, as it tends to create micro bursts on the network, and general consensus is to reduce the buffering amount.

This patch introduces a per-socket sk_pacing_rate, that approximates the current sending rate, and allows us to size the TSO packets so that we try to send one packet every ms. This field could be set by other transports.

This patch has no impact on high speed flows, where having large TSO packets makes sense to reach line rate. For other flows, this helps better packet scheduling and ACK clocking.

This patch increases performance of TCP flows in lossy environments.

A new sysctl (tcp_min_tso_segs) is added, to specify the minimal size of a TSO packet, in segments (default being 2).

A follow-up patch will provide a new packet scheduler (FQ), using sk_pacing_rate as an input to perform optional per flow pacing. This explains why we chose to set sk_pacing_rate to twice the current rate, allowing 'slow start' ramp up:

sk_pacing_rate = 2 * cwnd * mss / srtt

v2: Neal Cardwell reported suspect deferring of the last two segments on an initial write of 10 MSS; I had to change tcp_tso_should_defer() to take tp->xmit_size_goal_segs into account.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Van Jacobson <vanj@google.com>
Cc: Tom Herbert <therbert@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
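To make the formula concrete, the sketch below replays the same arithmetic in userspace. It is illustrative only, not part of the patch: HZ is assumed to be 1000, and the sample mss/cwnd/srtt values are made up. The kernel stores srtt in fixed point (smoothed RTT in jiffies, left-shifted by 3), which is why the numerator carries an HZ << 3 factor that cancels against it.

#include <stdio.h>
#include <stdint.h>

#define HZ 1000	/* assumed 1000 Hz tick for the example */

/* Userspace replay of the tcp_update_pacing_rate() arithmetic.
 * srtt8 is the kernel representation: smoothed RTT in jiffies << 3.
 */
static uint32_t pacing_rate(uint32_t mss, uint32_t cwnd, uint32_t srtt8)
{
	/* 200% of current rate: 2 * cwnd * mss / srtt */
	uint64_t rate = (uint64_t)mss * 2 * (HZ << 3);

	rate *= cwnd;
	if (srtt8 > 8 + 2)	/* same small-srtt guard as the patch */
		rate /= srtt8;

	return rate > UINT32_MAX ? UINT32_MAX : (uint32_t)rate;
}

int main(void)
{
	/* mss 1448, cwnd 10, srtt 100 ms: 2 * 10 * 1448 / 0.1 s = 289600 B/s */
	printf("%u bytes/sec\n", pacing_rate(1448, 10, 100 << 3));
	return 0;
}

The u64 intermediate matters: mss * 2 * (HZ << 3) * cwnd easily overflows 32 bits, and the final value is clamped back to the 32-bit sk_pacing_rate field.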
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c | 32 ++++++++++++++++++++++++++++++++++++-
1 file changed, 31 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ec492eae0cd7..1a84fffe6993 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 	}
 }
 
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u64 rate;
+
+	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+	rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+	rate *= max(tp->snd_cwnd, tp->packets_out);
+
+	/* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+	 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+	 * We probably need usec resolution in the future.
+	 * Note: This also takes care of possible srtt=0 case,
+	 * when tcp_rtt_estimator() was not yet called.
+	 */
+	if (tp->srtt > 8 + 2)
+		do_div(rate, tp->srtt);
+
+	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
+
 /* Calculate rto without backoff. This is the second half of Van Jacobson's
  * routine referred to above.
  */
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
-	u32 prior_in_flight;
+	u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
+	if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+		tcp_update_pacing_rate(sk);
 	return 1;
 
 no_queue:
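The changelog's goal of "one packet every ms", floored by the new tcp_min_tso_segs sysctl, is applied on the output side of this series (the diff above only produces the rate). A rough sketch of that sizing idea, with a hypothetical helper name:

#include <stdint.h>

/* Hypothetical tso_size_goal(): derive a TSO packet size from the pacing
 * rate, aiming for roughly 1 ms of data per packet but never less than
 * min_tso_segs full segments (the sysctl's default is 2). The real logic
 * lives near tp->xmit_size_goal_segs in the output path, not here.
 */
static uint32_t tso_size_goal(uint32_t pacing_rate, uint32_t mss,
			      uint32_t min_tso_segs)
{
	uint32_t segs = (pacing_rate / 1000) / mss;	/* ~1 ms of segments */

	if (segs < min_tso_segs)
		segs = min_tso_segs;
	return segs * mss;
}

For the ~290 KB/s flow in the earlier sketch the floor wins (2 * 1448 bytes per TSO packet), while a gigabit-class flow would ask for dozens of segments per packet (in practice bounded by the 64 KB GSO limit), matching the claim that high speed flows are unaffected.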