diff options
author | Eric Dumazet <edumazet@google.com> | 2013-08-27 08:46:32 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2013-08-29 15:50:06 -0400 |
commit | 95bd09eb27507691520d39ee1044d6ad831c1168 (patch) | |
tree | e05045cc6418ce08aa87d5f8c17366a7fa672f3c /net/ipv4 | |
parent | b800c3b966bcf004bd8592293a49ed5cb7ea67a9 (diff) |
tcp: TSO packets automatic sizing
After hearing many people over the past years complaining about TSO being
bursty or even buggy, we are proud to present automatic sizing of TSO
packets.
One part of the problem is that tcp_tso_should_defer() uses a heuristic
relying on upcoming ACKs instead of a timer, but more generally, having
big TSO packets makes little sense for low rates, as it tends to create
micro bursts on the network, and general consensus is to reduce the
buffering amount.
This patch introduces a per socket sk_pacing_rate, that approximates
the current sending rate, and allows us to size the TSO packets so
that we try to send one packet every ms.
This field could be set by other transports.
This patch has no impact on high-speed flows, where having large TSO packets
makes sense to reach line rate.
For other flows, this helps better packet scheduling and ACK clocking.
This patch increases performance of TCP flows in lossy environments.
A new sysctl (tcp_min_tso_segs) is added, to specify the
minimal size of a TSO packet (default being 2).
A follow-up patch will provide a new packet scheduler (FQ), using
sk_pacing_rate as an input to perform optional per flow pacing.
This explains why we chose to set sk_pacing_rate to twice the current
rate, allowing 'slow start' ramp up.
sk_pacing_rate = 2 * cwnd * mss / srtt
v2: Neal Cardwell reported a suspect deferral of the last two segments on
an initial write of 10 MSS; I had to change tcp_tso_should_defer() to take
tp->xmit_size_goal_segs into account.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Van Jacobson <vanj@google.com>
Cc: Tom Herbert <therbert@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 10 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 28 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 32 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 2 |
4 files changed, 65 insertions, 7 deletions
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 8ed7c32ae28e..540279f4c531 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -29,6 +29,7 @@ | |||
29 | static int zero; | 29 | static int zero; |
30 | static int one = 1; | 30 | static int one = 1; |
31 | static int four = 4; | 31 | static int four = 4; |
32 | static int gso_max_segs = GSO_MAX_SEGS; | ||
32 | static int tcp_retr1_max = 255; | 33 | static int tcp_retr1_max = 255; |
33 | static int ip_local_port_range_min[] = { 1, 1 }; | 34 | static int ip_local_port_range_min[] = { 1, 1 }; |
34 | static int ip_local_port_range_max[] = { 65535, 65535 }; | 35 | static int ip_local_port_range_max[] = { 65535, 65535 }; |
@@ -761,6 +762,15 @@ static struct ctl_table ipv4_table[] = { | |||
761 | .extra2 = &four, | 762 | .extra2 = &four, |
762 | }, | 763 | }, |
763 | { | 764 | { |
765 | .procname = "tcp_min_tso_segs", | ||
766 | .data = &sysctl_tcp_min_tso_segs, | ||
767 | .maxlen = sizeof(int), | ||
768 | .mode = 0644, | ||
769 | .proc_handler = proc_dointvec_minmax, | ||
770 | .extra1 = &zero, | ||
771 | .extra2 = &gso_max_segs, | ||
772 | }, | ||
773 | { | ||
764 | .procname = "udp_mem", | 774 | .procname = "udp_mem", |
765 | .data = &sysctl_udp_mem, | 775 | .data = &sysctl_udp_mem, |
766 | .maxlen = sizeof(sysctl_udp_mem), | 776 | .maxlen = sizeof(sysctl_udp_mem), |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e42c03859f4..fdf74090a001 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -283,6 +283,8 @@ | |||
283 | 283 | ||
284 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; | 284 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; |
285 | 285 | ||
286 | int sysctl_tcp_min_tso_segs __read_mostly = 2; | ||
287 | |||
286 | struct percpu_counter tcp_orphan_count; | 288 | struct percpu_counter tcp_orphan_count; |
287 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | 289 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
288 | 290 | ||
@@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, | |||
785 | xmit_size_goal = mss_now; | 787 | xmit_size_goal = mss_now; |
786 | 788 | ||
787 | if (large_allowed && sk_can_gso(sk)) { | 789 | if (large_allowed && sk_can_gso(sk)) { |
788 | xmit_size_goal = ((sk->sk_gso_max_size - 1) - | 790 | u32 gso_size, hlen; |
789 | inet_csk(sk)->icsk_af_ops->net_header_len - | 791 | |
790 | inet_csk(sk)->icsk_ext_hdr_len - | 792 | /* Maybe we should/could use sk->sk_prot->max_header here ? */ |
791 | tp->tcp_header_len); | 793 | hlen = inet_csk(sk)->icsk_af_ops->net_header_len + |
794 | inet_csk(sk)->icsk_ext_hdr_len + | ||
795 | tp->tcp_header_len; | ||
796 | |||
797 | /* Goal is to send at least one packet per ms, | ||
798 | * not one big TSO packet every 100 ms. | ||
799 | * This preserves ACK clocking and is consistent | ||
800 | * with tcp_tso_should_defer() heuristic. | ||
801 | */ | ||
802 | gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); | ||
803 | gso_size = max_t(u32, gso_size, | ||
804 | sysctl_tcp_min_tso_segs * mss_now); | ||
805 | |||
806 | xmit_size_goal = min_t(u32, gso_size, | ||
807 | sk->sk_gso_max_size - 1 - hlen); | ||
792 | 808 | ||
793 | /* TSQ : try to have two TSO segments in flight */ | 809 | /* TSQ : try to have at least two segments in flight |
810 | * (one in NIC TX ring, another in Qdisc) | ||
811 | */ | ||
794 | xmit_size_goal = min_t(u32, xmit_size_goal, | 812 | xmit_size_goal = min_t(u32, xmit_size_goal, |
795 | sysctl_tcp_limit_output_bytes >> 1); | 813 | sysctl_tcp_limit_output_bytes >> 1); |
796 | 814 | ||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ec492eae0cd7..1a84fffe6993 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | |||
688 | } | 688 | } |
689 | } | 689 | } |
690 | 690 | ||
691 | /* Set the sk_pacing_rate to allow proper sizing of TSO packets. | ||
692 | * Note: TCP stack does not yet implement pacing. | ||
693 | * FQ packet scheduler can be used to implement cheap but effective | ||
694 | * TCP pacing, to smooth the burst on large writes when packets | ||
695 | * in flight is significantly lower than cwnd (or rwin) | ||
696 | */ | ||
697 | static void tcp_update_pacing_rate(struct sock *sk) | ||
698 | { | ||
699 | const struct tcp_sock *tp = tcp_sk(sk); | ||
700 | u64 rate; | ||
701 | |||
702 | /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ | ||
703 | rate = (u64)tp->mss_cache * 2 * (HZ << 3); | ||
704 | |||
705 | rate *= max(tp->snd_cwnd, tp->packets_out); | ||
706 | |||
707 | /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), | ||
708 | * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) | ||
709 | * We probably need usec resolution in the future. | ||
710 | * Note: This also takes care of possible srtt=0 case, | ||
711 | * when tcp_rtt_estimator() was not yet called. | ||
712 | */ | ||
713 | if (tp->srtt > 8 + 2) | ||
714 | do_div(rate, tp->srtt); | ||
715 | |||
716 | sk->sk_pacing_rate = min_t(u64, rate, ~0U); | ||
717 | } | ||
718 | |||
691 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 719 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
692 | * routine referred to above. | 720 | * routine referred to above. |
693 | */ | 721 | */ |
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3278 | u32 ack_seq = TCP_SKB_CB(skb)->seq; | 3306 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
3279 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 3307 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
3280 | bool is_dupack = false; | 3308 | bool is_dupack = false; |
3281 | u32 prior_in_flight; | 3309 | u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; |
3282 | u32 prior_fackets; | 3310 | u32 prior_fackets; |
3283 | int prior_packets = tp->packets_out; | 3311 | int prior_packets = tp->packets_out; |
3284 | const int prior_unsacked = tp->packets_out - tp->sacked_out; | 3312 | const int prior_unsacked = tp->packets_out - tp->sacked_out; |
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3383 | 3411 | ||
3384 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) | 3412 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) |
3385 | tcp_schedule_loss_probe(sk); | 3413 | tcp_schedule_loss_probe(sk); |
3414 | if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) | ||
3415 | tcp_update_pacing_rate(sk); | ||
3386 | return 1; | 3416 | return 1; |
3387 | 3417 | ||
3388 | no_queue: | 3418 | no_queue: |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 884efff5b531..e63ae4c9691d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -1631,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
1631 | 1631 | ||
1632 | /* If a full-sized TSO skb can be sent, do it. */ | 1632 | /* If a full-sized TSO skb can be sent, do it. */ |
1633 | if (limit >= min_t(unsigned int, sk->sk_gso_max_size, | 1633 | if (limit >= min_t(unsigned int, sk->sk_gso_max_size, |
1634 | sk->sk_gso_max_segs * tp->mss_cache)) | 1634 | tp->xmit_size_goal_segs * tp->mss_cache)) |
1635 | goto send_now; | 1635 | goto send_now; |
1636 | 1636 | ||
1637 | /* Middle in queue won't get any more data, full sendable already? */ | 1637 | /* Middle in queue won't get any more data, full sendable already? */ |