7 files changed, 77 insertions, 7 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a2be556032c9..1cb3aeb4baff 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -482,6 +482,15 @@ tcp_syn_retries - INTEGER
 tcp_timestamps - BOOLEAN
        Enable timestamps as defined in RFC1323.
+tcp_min_tso_segs - INTEGER
+        Minimal number of segments per TSO frame.
+        Since linux-3.12, TCP does an automatic sizing of TSO frames,
+        depending on flow rate, instead of filling 64Kbytes packets.
+        For specific usages, it's possible to force TCP to build big
+        TSO frames. Note that TCP stack might split too big TSO packets
+        if available window is too small.
+        Default: 2
 tcp_tso_win_divisor - INTEGER
        This allows control over what percentage of the congestion window
        can be consumed by a single TSO frame.
diff --git a/include/net/sock.h b/include/net/sock.h
index e4bbcbfd07ea..6ba2e7b0e2b1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -232,6 +232,7 @@ struct cg_proto;
  *     @sk_napi_id: id of the last napi context to receive data for sk
  *     @sk_ll_usec: usecs to busypoll when there is no data
  *     @sk_allocation: allocation mode
+  *     @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
  *     @sk_sndbuf: size of send buffer in bytes
  *     @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *                %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -361,6 +362,7 @@ struct sock {
        kmemcheck_bitfield_end(flags);
        int                     sk_wmem_queued;
        gfp_t                   sk_allocation;
+        u32                     sk_pacing_rate; /* bytes per second */
        netdev_features_t       sk_route_caps;
        netdev_features_t       sk_route_nocaps;
        int                     sk_gso_type;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index dd5e16f66f84..6a6a88db462d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,7 @@ extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern unsigned int sysctl_tcp_notsent_lowat;
+extern int sysctl_tcp_min_tso_segs;
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 8ed7c32ae28e..540279f4c531 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
 static int zero;
 static int one = 1;
 static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -761,6 +762,15 @@ static struct ctl_table ipv4_table[] = {
                .extra2         = &four,
        },
        {
+                .procname       = "tcp_min_tso_segs",
+                .data           = &sysctl_tcp_min_tso_segs,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &zero,
+                .extra2         = &gso_max_segs,
+        },
+        {
                .procname       = "udp_mem",
                .data           = &sysctl_udp_mem,
                .maxlen         = sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4e42c03859f4..fdf74090a001 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,6 +283,8 @@
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
        xmit_size_goal = mss_now;
        if (large_allowed && sk_can_gso(sk)) {
-                xmit_size_goal = ((sk->sk_gso_max_size - 1) -
+                u32 gso_size, hlen;
-                                  inet_csk(sk)->icsk_af_ops->net_header_len -
-                                  inet_csk(sk)->icsk_ext_hdr_len -
+                /* Maybe we should/could use sk->sk_prot->max_header here ? */
-                                  tp->tcp_header_len);
+                hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+                       inet_csk(sk)->icsk_ext_hdr_len +
+                       tp->tcp_header_len;
+                /* Goal is to send at least one packet per ms,
+                 * not one big TSO packet every 100 ms.
+                 * This preserves ACK clocking and is consistent
+                 * with tcp_tso_should_defer() heuristic.
+                 */
+                gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+                gso_size = max_t(u32, gso_size,
+                                 sysctl_tcp_min_tso_segs * mss_now);
+                xmit_size_goal = min_t(u32, gso_size,
+                                       sk->sk_gso_max_size - 1 - hlen);
-                /* TSQ : try to have two TSO segments in flight */
+                /* TSQ : try to have at least two segments in flight
+                 * (one in NIC TX ring, another in Qdisc)
+                 */
                xmit_size_goal = min_t(u32, xmit_size_goal,
                                       sysctl_tcp_limit_output_bytes >> 1);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ec492eae0cd7..1a84fffe6993 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
        }
 }
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ *
+ * @sk: socket to update. The rate is derived from tp->mss_cache,
+ *      max(tp->snd_cwnd, tp->packets_out) and tp->srtt, computed in
+ *      64 bits and clamped to ~0U before being stored in
+ *      sk->sk_pacing_rate (documented in struct sock as bytes per second).
+ *
+ * In this patch it is called from tcp_ack() when srtt or snd_cwnd
+ * differ from the values sampled on entry.
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+        const struct tcp_sock *tp = tcp_sk(sk);
+        u64 rate;
+        /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt)
+         * The literal 2 is the 200 % factor; (HZ << 3) pairs with srtt,
+         * which per the note below is kept as jiffies << 3, so that the
+         * quotient comes out per second.
+         */
+        rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+        rate *= max(tp->snd_cwnd, tp->packets_out);
+        /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+         * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+         * We probably need usec resolution in the future.
+         * Note: This also takes care of possible srtt=0 case,
+         * when tcp_rtt_estimator() was not yet called.
+         *
+         * Concretely: the division is only performed when srtt > 10;
+         * for srtt <= 10 (including 0) rate is left undivided.
+         */
+        if (tp->srtt > 8 + 2)
+                do_div(rate, tp->srtt);
+        sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
        u32 ack_seq = TCP_SKB_CB(skb)->seq;
        u32 ack = TCP_SKB_CB(skb)->ack_seq;
        bool is_dupack = false;
-        u32 prior_in_flight;
+        u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
        u32 prior_fackets;
        int prior_packets = tp->packets_out;
        const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
        if (icsk->icsk_pending == ICSK_TIME_RETRANS)
                tcp_schedule_loss_probe(sk);
+        if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+                tcp_update_pacing_rate(sk);
        return 1;
 no_queue:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 884efff5b531..e63ae4c9691d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1631,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
        /* If a full-sized TSO skb can be sent, do it. */
        if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-                           sk->sk_gso_max_segs * tp->mss_cache))
+                           tp->xmit_size_goal_segs * tp->mss_cache))
                goto send_now;
        /* Middle in queue won't get any more data, full sendable already? */

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a2be556032c9..1cb3aeb4baff 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt
@@ -482,6 +482,15 @@ tcp_syn_retries - INTEGER
482	tcp_timestamps - BOOLEAN	482	tcp_timestamps - BOOLEAN
483	Enable timestamps as defined in RFC1323.	483	Enable timestamps as defined in RFC1323.
484		484
		485	tcp_min_tso_segs - INTEGER
		486	Minimal number of segments per TSO frame.
		487	Since linux-3.12, TCP does an automatic sizing of TSO frames,
		488	depending on flow rate, instead of filling 64Kbytes packets.
		489	For specific usages, it's possible to force TCP to build big
		490	TSO frames. Note that TCP stack might split too big TSO packets
		491	if available window is too small.
		492	Default: 2
		493
485	tcp_tso_win_divisor - INTEGER	494	tcp_tso_win_divisor - INTEGER
486	This allows control over what percentage of the congestion window	495	This allows control over what percentage of the congestion window
487	can be consumed by a single TSO frame.	496	can be consumed by a single TSO frame.


diff --git a/include/net/sock.h b/include/net/sock.h index e4bbcbfd07ea..6ba2e7b0e2b1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h
@@ -232,6 +232,7 @@ struct cg_proto;
232	* @sk_napi_id: id of the last napi context to receive data for sk	232	* @sk_napi_id: id of the last napi context to receive data for sk
233	* @sk_ll_usec: usecs to busypoll when there is no data	233	* @sk_ll_usec: usecs to busypoll when there is no data
234	* @sk_allocation: allocation mode	234	* @sk_allocation: allocation mode
		235	* @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
235	* @sk_sndbuf: size of send buffer in bytes	236	* @sk_sndbuf: size of send buffer in bytes
236	* @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,	237	* @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
237	* %SO_OOBINLINE settings, %SO_TIMESTAMPING settings	238	* %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -361,6 +362,7 @@ struct sock {
361	kmemcheck_bitfield_end(flags);	362	kmemcheck_bitfield_end(flags);
362	int sk_wmem_queued;	363	int sk_wmem_queued;
363	gfp_t sk_allocation;	364	gfp_t sk_allocation;
		365	u32 sk_pacing_rate; /* bytes per second */
364	netdev_features_t sk_route_caps;	366	netdev_features_t sk_route_caps;
365	netdev_features_t sk_route_nocaps;	367	netdev_features_t sk_route_nocaps;
366	int sk_gso_type;	368	int sk_gso_type;


diff --git a/include/net/tcp.h b/include/net/tcp.h index dd5e16f66f84..6a6a88db462d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h
@@ -281,6 +281,7 @@ extern int sysctl_tcp_early_retrans;
281	extern int sysctl_tcp_limit_output_bytes;	281	extern int sysctl_tcp_limit_output_bytes;
282	extern int sysctl_tcp_challenge_ack_limit;	282	extern int sysctl_tcp_challenge_ack_limit;
283	extern unsigned int sysctl_tcp_notsent_lowat;	283	extern unsigned int sysctl_tcp_notsent_lowat;
		284	extern int sysctl_tcp_min_tso_segs;
284		285
285	extern atomic_long_t tcp_memory_allocated;	286	extern atomic_long_t tcp_memory_allocated;
286	extern struct percpu_counter tcp_sockets_allocated;	287	extern struct percpu_counter tcp_sockets_allocated;


diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 8ed7c32ae28e..540279f4c531 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
29	static int zero;	29	static int zero;
30	static int one = 1;	30	static int one = 1;
31	static int four = 4;	31	static int four = 4;
		32	static int gso_max_segs = GSO_MAX_SEGS;
32	static int tcp_retr1_max = 255;	33	static int tcp_retr1_max = 255;
33	static int ip_local_port_range_min[] = { 1, 1 };	34	static int ip_local_port_range_min[] = { 1, 1 };
34	static int ip_local_port_range_max[] = { 65535, 65535 };	35	static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -761,6 +762,15 @@ static struct ctl_table ipv4_table[] = {
761	.extra2 = &four,	762	.extra2 = &four,
762	},	763	},
763	{	764	{
		765	.procname = "tcp_min_tso_segs",
		766	.data = &sysctl_tcp_min_tso_segs,
		767	.maxlen = sizeof(int),
		768	.mode = 0644,
		769	.proc_handler = proc_dointvec_minmax,
		770	.extra1 = &zero,
		771	.extra2 = &gso_max_segs,
		772	},
		773	{
764	.procname = "udp_mem",	774	.procname = "udp_mem",
765	.data = &sysctl_udp_mem,	775	.data = &sysctl_udp_mem,
766	.maxlen = sizeof(sysctl_udp_mem),	776	.maxlen = sizeof(sysctl_udp_mem),


diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e42c03859f4..fdf74090a001 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c
@@ -283,6 +283,8 @@
283		283
284	int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;	284	int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
285		285
		286	int sysctl_tcp_min_tso_segs __read_mostly = 2;
		287
286	struct percpu_counter tcp_orphan_count;	288	struct percpu_counter tcp_orphan_count;
287	EXPORT_SYMBOL_GPL(tcp_orphan_count);	289	EXPORT_SYMBOL_GPL(tcp_orphan_count);
288		290
@@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
785	xmit_size_goal = mss_now;	787	xmit_size_goal = mss_now;
786		788
787	if (large_allowed && sk_can_gso(sk)) {	789	if (large_allowed && sk_can_gso(sk)) {
788	xmit_size_goal = ((sk->sk_gso_max_size - 1) -	790	u32 gso_size, hlen;
789	inet_csk(sk)->icsk_af_ops->net_header_len -	791
790	inet_csk(sk)->icsk_ext_hdr_len -	792	/* Maybe we should/could use sk->sk_prot->max_header here ? */
791	tp->tcp_header_len);	793	hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
		794	inet_csk(sk)->icsk_ext_hdr_len +
		795	tp->tcp_header_len;
		796
		797	/* Goal is to send at least one packet per ms,
		798	* not one big TSO packet every 100 ms.
		799	* This preserves ACK clocking and is consistent
		800	* with tcp_tso_should_defer() heuristic.
		801	*/
		802	gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
		803	gso_size = max_t(u32, gso_size,
		804	sysctl_tcp_min_tso_segs * mss_now);
		805
		806	xmit_size_goal = min_t(u32, gso_size,
		807	sk->sk_gso_max_size - 1 - hlen);
792		808
793	/* TSQ : try to have two TSO segments in flight */	809	/* TSQ : try to have at least two segments in flight
		810	* (one in NIC TX ring, another in Qdisc)
		811	*/
794	xmit_size_goal = min_t(u32, xmit_size_goal,	812	xmit_size_goal = min_t(u32, xmit_size_goal,
795	sysctl_tcp_limit_output_bytes >> 1);	813	sysctl_tcp_limit_output_bytes >> 1);
796		814


diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ec492eae0cd7..1a84fffe6993 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
688	}	688	}
689	}	689	}
690		690
		691	/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
		692	* Note: TCP stack does not yet implement pacing.
		693	* FQ packet scheduler can be used to implement cheap but effective
		694	* TCP pacing, to smooth the burst on large writes when packets
		695	* in flight is significantly lower than cwnd (or rwin)
		696	*/
		697	static void tcp_update_pacing_rate(struct sock *sk)
		698	{
		699	const struct tcp_sock *tp = tcp_sk(sk);
		700	u64 rate;
		701
		702	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
		703	rate = (u64)tp->mss_cache * 2 * (HZ << 3);
		704
		705	rate *= max(tp->snd_cwnd, tp->packets_out);
		706
		707	/* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
		708	* be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
		709	* We probably need usec resolution in the future.
		710	* Note: This also takes care of possible srtt=0 case,
		711	* when tcp_rtt_estimator() was not yet called.
		712	*/
		713	if (tp->srtt > 8 + 2)
		714	do_div(rate, tp->srtt);
		715
		716	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
		717	}
		718
691	/* Calculate rto without backoff. This is the second half of Van Jacobson's	719	/* Calculate rto without backoff. This is the second half of Van Jacobson's
692	* routine referred to above.	720	* routine referred to above.
693	*/	721	*/
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3278	u32 ack_seq = TCP_SKB_CB(skb)->seq;	3306	u32 ack_seq = TCP_SKB_CB(skb)->seq;
3279	u32 ack = TCP_SKB_CB(skb)->ack_seq;	3307	u32 ack = TCP_SKB_CB(skb)->ack_seq;
3280	bool is_dupack = false;	3308	bool is_dupack = false;
3281	u32 prior_in_flight;	3309	u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
3282	u32 prior_fackets;	3310	u32 prior_fackets;
3283	int prior_packets = tp->packets_out;	3311	int prior_packets = tp->packets_out;
3284	const int prior_unsacked = tp->packets_out - tp->sacked_out;	3312	const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3383		3411
3384	if (icsk->icsk_pending == ICSK_TIME_RETRANS)	3412	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3385	tcp_schedule_loss_probe(sk);	3413	tcp_schedule_loss_probe(sk);
		3414	if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
		3415	tcp_update_pacing_rate(sk);
3386	return 1;	3416	return 1;
3387		3417
3388	no_queue:	3418	no_queue:


diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 884efff5b531..e63ae4c9691d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c
@@ -1631,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1631		1631
1632	/* If a full-sized TSO skb can be sent, do it. */	1632	/* If a full-sized TSO skb can be sent, do it. */
1633	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,	1633	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
1634	sk->sk_gso_max_segs * tp->mss_cache))	1634	tp->xmit_size_goal_segs * tp->mss_cache))
1635	goto send_now;	1635	goto send_now;
1636		1636
1637	/* Middle in queue won't get any more data, full sendable already? */	1637	/* Middle in queue won't get any more data, full sendable already? */