author     Eric Dumazet <edumazet@google.com>       2014-12-07 15:22:18 -0500
committer  David S. Miller <davem@davemloft.net>    2014-12-09 16:39:22 -0500
commit     605ad7f184b60cfaacbc038aa6c55ee68dee3c89 (patch)
tree       e4c88937452f13283365fdcd4d1b5a900c6084a7 /net/ipv4
parent     5e84e189ce1323978afebfba89d3c18cd3f3643c (diff)
tcp: refine TSO autosizing
Commit 95bd09eb2750 ("tcp: TSO packets automatic sizing") tried to control
TSO size, but did this at the wrong place (sendmsg() time).

At sendmsg() time, we might have a pessimistic view of the flow rate, and we
end up building very small skbs (with 2 MSS per skb).

This is bad because:

- It sends small TSO packets even in Slow Start, where the rate quickly
  increases.
- It tends to make the socket write queue very big, increasing tcp_ack()
  processing time, but also increasing memory needs, not necessarily
  accounted for, as fast clone overhead is currently ignored.
- It lowers GRO efficiency and generates more ACK packets.

Servers with a lot of short-lived connections suffer from this.

Let's instead fill skbs as much as possible (64KB of payload), but split them
at xmit time, when we have a precise idea of the flow rate. The skb split is
actually quite efficient.

The patch looks bigger than necessary, because the TCP Small Queues decision
now has to take place after the eventual split.

As Neal suggested, introduce a new tcp_tso_autosize() helper, so that
tcp_tso_should_defer() can be synchronized on the same goal.

Rename tp->xmit_size_goal_segs to tp->gso_segs, as this variable contains the
number of MSS we can put in a GSO packet, and is no longer related to the
autosizing goal.

Tested on a 40 ms RTT link:

nstat >/dev/null
netperf -H remote -l -2000000 -- -s 1000000
nstat | egrep "IpInReceives|IpOutRequests|TcpOutSegs|IpExtOutOctets"

Before patch:

Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/s

 87380 2000000 2000000     0.36        44.22

IpInReceives        600       0.0
IpOutRequests       599       0.0
TcpOutSegs         1397       0.0
IpExtOutOctets  2033249       0.0

After patch:

Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380 2000000 2000000     0.36        44.27

IpInReceives        221       0.0
IpOutRequests       232       0.0
TcpOutSegs         1397       0.0
IpExtOutOctets  2013953       0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
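Editor's note: as an aid for reading the diff below, here is the per-millisecond
sizing arithmetic of the new tcp_tso_autosize() helper replayed as a standalone
user-space sketch. The constants and sample inputs here (GSO_MAX_SIZE,
MAX_TCP_HDR, MIN_TSO_SEGS, GSO_MAX_SEGS, the MSS and the 10 Mbit/s and 1 Gbit/s
pacing rates) are illustrative assumptions, not values taken from this patch.

/* tso_autosize_sketch.c - illustrative, user-space mirror of the
 * tcp_tso_autosize() arithmetic added by this patch (assumed constants).
 */
#include <stdio.h>
#include <stdint.h>

#define GSO_MAX_SIZE 65536u   /* assumed stand-in for sk->sk_gso_max_size */
#define MAX_TCP_HDR  320u     /* assumed stand-in for MAX_TCP_HEADER */
#define MIN_TSO_SEGS 2u       /* assumed stand-in for sysctl_tcp_min_tso_segs */
#define GSO_MAX_SEGS 65535u   /* assumed stand-in for sk->sk_gso_max_segs */

static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }
static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

/* Goal: roughly one TSO packet per millisecond at the current pacing rate. */
static uint32_t tso_autosize(uint64_t pacing_rate, uint32_t mss)
{
        /* pacing_rate is in bytes/sec; >> 10 is a cheap approximation of
         * dividing by 1000, i.e. the byte budget for about one millisecond.
         */
        uint32_t bytes = min_u32((uint32_t)(pacing_rate >> 10),
                                 GSO_MAX_SIZE - 1 - MAX_TCP_HDR);
        uint32_t segs = max_u32(bytes / mss, MIN_TSO_SEGS);

        return min_u32(segs, GSO_MAX_SEGS);
}

int main(void)
{
        uint32_t mss = 1448;    /* assumed MSS for the example */

        printf("at 10 Mbit/s: %u segs per TSO packet\n",
               tso_autosize(1250000ull, mss));
        printf("at 1 Gbit/s : %u segs per TSO packet\n",
               tso_autosize(125000000ull, mss));
        return 0;
}

With a low rate estimate the goal stays at the 2-segment floor, while at
1 Gbit/s it approaches the 64KB GSO limit; this is why the patch defers the
decision to xmit time, when the rate estimate is reliable, instead of making
it at sendmsg() time.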
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/tcp.c         60
-rw-r--r--  net/ipv4/tcp_output.c  59
2 files changed, 62 insertions(+), 57 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dc13a3657e8e..427aee33ffc0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -835,47 +835,29 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
                                        int large_allowed)
 {
         struct tcp_sock *tp = tcp_sk(sk);
-        u32 xmit_size_goal, old_size_goal;
+        u32 new_size_goal, size_goal, hlen;
 
-        xmit_size_goal = mss_now;
-
-        if (large_allowed && sk_can_gso(sk)) {
-                u32 gso_size, hlen;
-
-                /* Maybe we should/could use sk->sk_prot->max_header here ? */
-                hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
-                       inet_csk(sk)->icsk_ext_hdr_len +
-                       tp->tcp_header_len;
-
-                /* Goal is to send at least one packet per ms,
-                 * not one big TSO packet every 100 ms.
-                 * This preserves ACK clocking and is consistent
-                 * with tcp_tso_should_defer() heuristic.
-                 */
-                gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
-                gso_size = max_t(u32, gso_size,
-                                 sysctl_tcp_min_tso_segs * mss_now);
-
-                xmit_size_goal = min_t(u32, gso_size,
-                                       sk->sk_gso_max_size - 1 - hlen);
-
-                xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
-
-                /* We try hard to avoid divides here */
-                old_size_goal = tp->xmit_size_goal_segs * mss_now;
-
-                if (likely(old_size_goal <= xmit_size_goal &&
-                           old_size_goal + mss_now > xmit_size_goal)) {
-                        xmit_size_goal = old_size_goal;
-                } else {
-                        tp->xmit_size_goal_segs =
-                                min_t(u16, xmit_size_goal / mss_now,
-                                      sk->sk_gso_max_segs);
-                        xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
-                }
+        if (!large_allowed || !sk_can_gso(sk))
+                return mss_now;
+
+        /* Maybe we should/could use sk->sk_prot->max_header here ? */
+        hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+               inet_csk(sk)->icsk_ext_hdr_len +
+               tp->tcp_header_len;
+
+        new_size_goal = sk->sk_gso_max_size - 1 - hlen;
+        new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+
+        /* We try hard to avoid divides here */
+        size_goal = tp->gso_segs * mss_now;
+        if (unlikely(new_size_goal < size_goal ||
+                     new_size_goal >= size_goal + mss_now)) {
+                tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+                                     sk->sk_gso_max_segs);
+                size_goal = tp->gso_segs * mss_now;
         }
 
-        return max(xmit_size_goal, mss_now);
+        return max(size_goal, mss_now);
 }
 
 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f5bd4bd3f7e6..f37ecf53ee8a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1524,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
                ((nonagle & TCP_NAGLE_CORK) ||
                 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+        u32 bytes, segs;
+
+        bytes = min(sk->sk_pacing_rate >> 10,
+                    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+        /* Goal is to send at least one packet per ms,
+         * not one big TSO packet every 100 ms.
+         * This preserves ACK clocking and is consistent
+         * with tcp_tso_should_defer() heuristic.
+         */
+        segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+        return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
 /* Returns the portion of skb which can be sent right away */
 static unsigned int tcp_mss_split_point(const struct sock *sk,
                                         const struct sk_buff *skb,
@@ -1731,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  * This algorithm is from John Heffner.
  */
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
-                                 bool *is_cwnd_limited)
+                                 bool *is_cwnd_limited, u32 max_segs)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1761,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
         limit = min(send_win, cong_win);
 
         /* If a full-sized TSO skb can be sent, do it. */
-        if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-                           tp->xmit_size_goal_segs * tp->mss_cache))
+        if (limit >= max_segs * tp->mss_cache)
                 goto send_now;
 
         /* Middle in queue won't get any more data, full sendable already? */
@@ -1959,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
         int cwnd_quota;
         int result;
         bool is_cwnd_limited = false;
+        u32 max_segs;
 
         sent_pkts = 0;
 
@@ -1972,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                 }
         }
 
+        max_segs = tcp_tso_autosize(sk, mss_now);
         while ((skb = tcp_send_head(sk))) {
                 unsigned int limit;
 
@@ -2004,10 +2026,23 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                 break;
                 } else {
                         if (!push_one &&
-                            tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+                            tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+                                                 max_segs))
                                 break;
                 }
 
+                limit = mss_now;
+                if (tso_segs > 1 && !tcp_urg_mode(tp))
+                        limit = tcp_mss_split_point(sk, skb, mss_now,
+                                                    min_t(unsigned int,
+                                                          cwnd_quota,
+                                                          max_segs),
+                                                    nonagle);
+
+                if (skb->len > limit &&
+                    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+                        break;
+
                 /* TCP Small Queues :
                  * Control number of packets in qdisc/devices to two packets / or ~1 ms.
                  * This allows for :
@@ -2018,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                  * of queued bytes to ensure line rate.
                  * One example is wifi aggregation (802.11 AMPDU)
                  */
-                limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
-                              sk->sk_pacing_rate >> 10);
+                limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+                limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
 
                 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
                         set_bit(TSQ_THROTTLED, &tp->tsq_flags);
@@ -2032,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                         break;
                 }
 
-                limit = mss_now;
-                if (tso_segs > 1 && !tcp_urg_mode(tp))
-                        limit = tcp_mss_split_point(sk, skb, mss_now,
-                                                    min_t(unsigned int,
-                                                          cwnd_quota,
-                                                          sk->sk_gso_max_segs),
-                                                    nonagle);
-
-                if (skb->len > limit &&
-                    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
-                        break;
-
                 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
                         break;
 
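Editor's note on the TCP Small Queues change in the tcp_output.c hunk above:
the old code took the larger of sysctl_tcp_limit_output_bytes and the
pacing-derived budget, so the sysctl acted as a floor; the new code budgets
roughly 1 ms of data (but never less than two skbs worth of truesize) and
uses the sysctl as a cap. The sketch below replays that arithmetic in user
space; the sysctl default and the skb truesize are assumed values chosen
only for illustration.

/* tsq_limit_sketch.c - illustrative mirror of the new TSQ limit computation
 * (assumed constants, not kernel code).
 */
#include <stdio.h>
#include <stdint.h>

#define TCP_LIMIT_OUTPUT_BYTES 131072u  /* assumed sysctl_tcp_limit_output_bytes */

static uint32_t tsq_limit(uint64_t pacing_rate, uint32_t skb_truesize)
{
        /* ~1 ms of payload at the current pacing rate, floored at two skbs,
         * capped by the sysctl.
         */
        uint32_t limit = (uint32_t)(pacing_rate >> 10);

        if (limit < 2 * skb_truesize)
                limit = 2 * skb_truesize;
        if (limit > TCP_LIMIT_OUTPUT_BYTES)
                limit = TCP_LIMIT_OUTPUT_BYTES;
        return limit;
}

int main(void)
{
        uint32_t truesize = 4096;       /* assumed truesize of one queued skb */

        printf("10 Mbit/s : %u bytes\n", tsq_limit(1250000ull, truesize));    /* floor: 2 skbs  */
        printf("1 Gbit/s  : %u bytes\n", tsq_limit(125000000ull, truesize));  /* ~1 ms of data  */
        printf("10 Gbit/s : %u bytes\n", tsq_limit(1250000000ull, truesize)); /* sysctl ceiling */
        return 0;
}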