author	Eric Dumazet <edumazet@google.com>	2013-08-27 08:46:32 -0400
committer	David S. Miller <davem@davemloft.net>	2013-08-29 15:50:06 -0400
commit	95bd09eb27507691520d39ee1044d6ad831c1168 (patch)
tree	e05045cc6418ce08aa87d5f8c17366a7fa672f3c
parent	b800c3b966bcf004bd8592293a49ed5cb7ea67a9 (diff)
tcp: TSO packets automatic sizing
After hearing many people over the past years complaining about TSO being bursty or even buggy, we are proud to present automatic sizing of TSO packets.

One part of the problem is that tcp_tso_should_defer() uses a heuristic relying on upcoming ACKs instead of a timer, but more generally, having big TSO packets makes little sense for low rates, as it tends to create micro bursts on the network, and the general consensus is to reduce the buffering amount.

This patch introduces a per-socket sk_pacing_rate that approximates the current sending rate, and allows us to size the TSO packets so that we try to send one packet every ms. This field could be set by other transports.

The patch has no impact on high speed flows, where having large TSO packets makes sense to reach line rate. For other flows, it helps packet scheduling and ACK clocking, and increases the performance of TCP flows in lossy environments.

A new sysctl (tcp_min_tso_segs) is added, to specify the minimal size of a TSO packet (default being 2).

A follow-up patch will provide a new packet scheduler (FQ), using sk_pacing_rate as an input to perform optional per-flow pacing. This explains why we chose to set sk_pacing_rate to twice the current rate, allowing 'slow start' ramp up.

sk_pacing_rate = 2 * cwnd * mss / srtt

v2: Neal Cardwell reported a suspicious deferring of the last two segments on an initial write of 10 MSS; tcp_tso_should_defer() was changed to take tp->xmit_size_goal_segs into account.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Van Jacobson <vanj@google.com>
Cc: Tom Herbert <therbert@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
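[Editor's note: to make the formula above concrete, here is a minimal userspace sketch, not kernel code; the function name and the sample cwnd/mss/srtt values are purely illustrative, and srtt is expressed in plain seconds rather than the kernel's jiffies<<3 representation.]

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: sk_pacing_rate = 2 * cwnd * mss / srtt,
 * i.e. twice the current delivery rate, leaving headroom for
 * a slow-start style ramp up as the changelog explains.
 */
static uint64_t pacing_rate_bytes_per_sec(uint32_t cwnd, uint32_t mss,
					  double srtt_seconds)
{
	return (uint64_t)(2.0 * cwnd * mss / srtt_seconds);
}

int main(void)
{
	/* Hypothetical flow: 40 segments of 1448 bytes in flight, 50 ms RTT */
	uint64_t rate = pacing_rate_bytes_per_sec(40, 1448, 0.050);

	/* Prints roughly 2.3 MB/s for this example flow */
	printf("sk_pacing_rate ~= %llu bytes/sec\n",
	       (unsigned long long)rate);
	return 0;
}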
-rw-r--r--	Documentation/networking/ip-sysctl.txt	 9
-rw-r--r--	include/net/sock.h	 2
-rw-r--r--	include/net/tcp.h	 1
-rw-r--r--	net/ipv4/sysctl_net_ipv4.c	10
-rw-r--r--	net/ipv4/tcp.c	28
-rw-r--r--	net/ipv4/tcp_input.c	32
-rw-r--r--	net/ipv4/tcp_output.c	 2
7 files changed, 77 insertions, 7 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a2be556032c9..1cb3aeb4baff 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -482,6 +482,15 @@ tcp_syn_retries - INTEGER
 tcp_timestamps - BOOLEAN
 	Enable timestamps as defined in RFC1323.
 
+tcp_min_tso_segs - INTEGER
+	Minimal number of segments per TSO frame.
+	Since linux-3.12, TCP does an automatic sizing of TSO frames,
+	depending on flow rate, instead of filling 64Kbytes packets.
+	For specific usages, it's possible to force TCP to build big
+	TSO frames. Note that TCP stack might split too big TSO packets
+	if available window is too small.
+	Default: 2
+
 tcp_tso_win_divisor - INTEGER
 	This allows control over what percentage of the congestion window
 	can be consumed by a single TSO frame.
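[Editor's note: the knob documented above is exposed through procfs in the usual way, as /proc/sys/net/ipv4/tcp_min_tso_segs. The following small userspace sketch (illustrative, not part of the patch) just reads the current value back:]

#include <stdio.h>

/* Read the current tcp_min_tso_segs value (default 2 after this patch). */
int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_min_tso_segs", "r");
	int min_segs = 0;

	if (!f) {
		perror("tcp_min_tso_segs");
		return 1;
	}
	if (fscanf(f, "%d", &min_segs) == 1)
		printf("tcp_min_tso_segs = %d\n", min_segs);
	fclose(f);
	return 0;
}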
diff --git a/include/net/sock.h b/include/net/sock.h
index e4bbcbfd07ea..6ba2e7b0e2b1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -232,6 +232,7 @@ struct cg_proto;
  *	@sk_napi_id: id of the last napi context to receive data for sk
  *	@sk_ll_usec: usecs to busypoll when there is no data
  *	@sk_allocation: allocation mode
+ *	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
  *	@sk_sndbuf: size of send buffer in bytes
  *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *		   %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -361,6 +362,7 @@ struct sock {
 	kmemcheck_bitfield_end(flags);
 	int			sk_wmem_queued;
 	gfp_t			sk_allocation;
+	u32			sk_pacing_rate; /* bytes per second */
 	netdev_features_t	sk_route_caps;
 	netdev_features_t	sk_route_nocaps;
 	int			sk_gso_type;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index dd5e16f66f84..6a6a88db462d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,7 @@ extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 extern unsigned int sysctl_tcp_notsent_lowat;
+extern int sysctl_tcp_min_tso_segs;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 8ed7c32ae28e..540279f4c531 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
 static int zero;
 static int one = 1;
 static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
 static int ip_local_port_range_min[] = { 1, 1 };
 static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -761,6 +762,15 @@ static struct ctl_table ipv4_table[] = {
 		.extra2		= &four,
 	},
 	{
+		.procname	= "tcp_min_tso_segs",
+		.data		= &sysctl_tcp_min_tso_segs,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &gso_max_segs,
+	},
+	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4e42c03859f4..fdf74090a001 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,6 +283,8 @@
 
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
@@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 	xmit_size_goal = mss_now;
 
 	if (large_allowed && sk_can_gso(sk)) {
-		xmit_size_goal = ((sk->sk_gso_max_size - 1) -
-				  inet_csk(sk)->icsk_af_ops->net_header_len -
-				  inet_csk(sk)->icsk_ext_hdr_len -
-				  tp->tcp_header_len);
+		u32 gso_size, hlen;
+
+		/* Maybe we should/could use sk->sk_prot->max_header here ? */
+		hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+		       inet_csk(sk)->icsk_ext_hdr_len +
+		       tp->tcp_header_len;
+
+		/* Goal is to send at least one packet per ms,
+		 * not one big TSO packet every 100 ms.
+		 * This preserves ACK clocking and is consistent
+		 * with tcp_tso_should_defer() heuristic.
+		 */
+		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+		gso_size = max_t(u32, gso_size,
+				 sysctl_tcp_min_tso_segs * mss_now);
+
+		xmit_size_goal = min_t(u32, gso_size,
+				       sk->sk_gso_max_size - 1 - hlen);
 
-		/* TSQ : try to have two TSO segments in flight */
+		/* TSQ : try to have at least two segments in flight
+		 * (one in NIC TX ring, another in Qdisc)
+		 */
 		xmit_size_goal = min_t(u32, xmit_size_goal,
 				       sysctl_tcp_limit_output_bytes >> 1);
 
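[Editor's note: as a worked example of the sizing logic above, here is a userspace sketch with hypothetical numbers, not the kernel function itself; tso_size_goal() and the gso_max argument (already reduced by headers) are invented for illustration.]

#include <stdint.h>
#include <stdio.h>

/* Illustrative mirror of the new sizing step: aim for roughly one packet
 * per ms (pacing_rate / 2 / 1000), never below tcp_min_tso_segs full
 * segments, never above the per-socket GSO size limit.
 */
static uint32_t tso_size_goal(uint32_t pacing_rate, uint32_t mss,
			      uint32_t min_tso_segs, uint32_t gso_max)
{
	uint32_t goal = pacing_rate / (2 * 1000);	/* one packet per ms */

	if (goal < min_tso_segs * mss)
		goal = min_tso_segs * mss;
	if (goal > gso_max)
		goal = gso_max;
	return goal;
}

int main(void)
{
	/* ~2.3 MB/s flow: the goal collapses to the 2-MSS floor (2896 bytes)
	 * instead of a full 64KB TSO burst.
	 */
	printf("slow flow goal: %u bytes\n",
	       tso_size_goal(2316800, 1448, 2, 65536 - 1 - 40));

	/* ~1 GB/s flow: the goal is capped by the GSO limit, keeping
	 * full-sized TSO packets for high speed flows.
	 */
	printf("fast flow goal: %u bytes\n",
	       tso_size_goal(1000000000, 1448, 2, 65536 - 1 - 40));
	return 0;
}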
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ec492eae0cd7..1a84fffe6993 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 	}
 }
 
+/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
+ * Note: TCP stack does not yet implement pacing.
+ * FQ packet scheduler can be used to implement cheap but effective
+ * TCP pacing, to smooth the burst on large writes when packets
+ * in flight is significantly lower than cwnd (or rwin)
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u64 rate;
+
+	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
+	rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+	rate *= max(tp->snd_cwnd, tp->packets_out);
+
+	/* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
+	 * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+	 * We probably need usec resolution in the future.
+	 * Note: This also takes care of possible srtt=0 case,
+	 * when tcp_rtt_estimator() was not yet called.
+	 */
+	if (tp->srtt > 8 + 2)
+		do_div(rate, tp->srtt);
+
+	sk->sk_pacing_rate = min_t(u64, rate, ~0U);
+}
+
 /* Calculate rto without backoff. This is the second half of Van Jacobson's
  * routine referred to above.
  */
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	bool is_dupack = false;
-	u32 prior_in_flight;
+	u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
 	u32 prior_fackets;
 	int prior_packets = tp->packets_out;
 	const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
 		tcp_schedule_loss_probe(sk);
+	if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+		tcp_update_pacing_rate(sk);
 	return 1;
 
 no_queue:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 884efff5b531..e63ae4c9691d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1631,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 
 	/* If a full-sized TSO skb can be sent, do it. */
 	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-			   sk->sk_gso_max_segs * tp->mss_cache))
+			   tp->xmit_size_goal_segs * tp->mss_cache))
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */