diff options
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 9 | ||||
-rw-r--r-- | include/net/sock.h | 2 | ||||
-rw-r--r-- | include/net/tcp.h | 1 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 10 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 28 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 32 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 2 |
7 files changed, 77 insertions, 7 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a2be556032c9..1cb3aeb4baff 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -482,6 +482,15 @@ tcp_syn_retries - INTEGER | |||
482 | tcp_timestamps - BOOLEAN | 482 | tcp_timestamps - BOOLEAN |
483 | Enable timestamps as defined in RFC1323. | 483 | Enable timestamps as defined in RFC1323. |
484 | 484 | ||
485 | tcp_min_tso_segs - INTEGER | ||
486 | Minimum number of segments per TSO frame. | ||
487 | Since linux-3.12, TCP automatically sizes TSO frames | ||
488 | depending on flow rate, instead of filling 64Kbytes packets. | ||
489 | For specific usages, it's possible to force TCP to build big | ||
490 | TSO frames. Note that the TCP stack might split TSO packets that | ||
491 | are too big if the available window is too small. | ||
492 | Default: 2 | ||
493 | |||
485 | tcp_tso_win_divisor - INTEGER | 494 | tcp_tso_win_divisor - INTEGER |
486 | This allows control over what percentage of the congestion window | 495 | This allows control over what percentage of the congestion window |
487 | can be consumed by a single TSO frame. | 496 | can be consumed by a single TSO frame. |
diff --git a/include/net/sock.h b/include/net/sock.h index e4bbcbfd07ea..6ba2e7b0e2b1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -232,6 +232,7 @@ struct cg_proto; | |||
232 | * @sk_napi_id: id of the last napi context to receive data for sk | 232 | * @sk_napi_id: id of the last napi context to receive data for sk |
233 | * @sk_ll_usec: usecs to busypoll when there is no data | 233 | * @sk_ll_usec: usecs to busypoll when there is no data |
234 | * @sk_allocation: allocation mode | 234 | * @sk_allocation: allocation mode |
235 | * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) | ||
235 | * @sk_sndbuf: size of send buffer in bytes | 236 | * @sk_sndbuf: size of send buffer in bytes |
236 | * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, | 237 | * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, |
237 | * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings | 238 | * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings |
@@ -361,6 +362,7 @@ struct sock { | |||
361 | kmemcheck_bitfield_end(flags); | 362 | kmemcheck_bitfield_end(flags); |
362 | int sk_wmem_queued; | 363 | int sk_wmem_queued; |
363 | gfp_t sk_allocation; | 364 | gfp_t sk_allocation; |
365 | u32 sk_pacing_rate; /* bytes per second */ | ||
364 | netdev_features_t sk_route_caps; | 366 | netdev_features_t sk_route_caps; |
365 | netdev_features_t sk_route_nocaps; | 367 | netdev_features_t sk_route_nocaps; |
366 | int sk_gso_type; | 368 | int sk_gso_type; |
diff --git a/include/net/tcp.h b/include/net/tcp.h index dd5e16f66f84..6a6a88db462d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -281,6 +281,7 @@ extern int sysctl_tcp_early_retrans; | |||
281 | extern int sysctl_tcp_limit_output_bytes; | 281 | extern int sysctl_tcp_limit_output_bytes; |
282 | extern int sysctl_tcp_challenge_ack_limit; | 282 | extern int sysctl_tcp_challenge_ack_limit; |
283 | extern unsigned int sysctl_tcp_notsent_lowat; | 283 | extern unsigned int sysctl_tcp_notsent_lowat; |
284 | extern int sysctl_tcp_min_tso_segs; | ||
284 | 285 | ||
285 | extern atomic_long_t tcp_memory_allocated; | 286 | extern atomic_long_t tcp_memory_allocated; |
286 | extern struct percpu_counter tcp_sockets_allocated; | 287 | extern struct percpu_counter tcp_sockets_allocated; |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 8ed7c32ae28e..540279f4c531 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -29,6 +29,7 @@ | |||
29 | static int zero; | 29 | static int zero; |
30 | static int one = 1; | 30 | static int one = 1; |
31 | static int four = 4; | 31 | static int four = 4; |
32 | static int gso_max_segs = GSO_MAX_SEGS; | ||
32 | static int tcp_retr1_max = 255; | 33 | static int tcp_retr1_max = 255; |
33 | static int ip_local_port_range_min[] = { 1, 1 }; | 34 | static int ip_local_port_range_min[] = { 1, 1 }; |
34 | static int ip_local_port_range_max[] = { 65535, 65535 }; | 35 | static int ip_local_port_range_max[] = { 65535, 65535 }; |
@@ -761,6 +762,15 @@ static struct ctl_table ipv4_table[] = { | |||
761 | .extra2 = &four, | 762 | .extra2 = &four, |
762 | }, | 763 | }, |
763 | { | 764 | { |
765 | .procname = "tcp_min_tso_segs", | ||
766 | .data = &sysctl_tcp_min_tso_segs, | ||
767 | .maxlen = sizeof(int), | ||
768 | .mode = 0644, | ||
769 | .proc_handler = proc_dointvec_minmax, | ||
770 | .extra1 = &zero, | ||
771 | .extra2 = &gso_max_segs, | ||
772 | }, | ||
773 | { | ||
764 | .procname = "udp_mem", | 774 | .procname = "udp_mem", |
765 | .data = &sysctl_udp_mem, | 775 | .data = &sysctl_udp_mem, |
766 | .maxlen = sizeof(sysctl_udp_mem), | 776 | .maxlen = sizeof(sysctl_udp_mem), |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e42c03859f4..fdf74090a001 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -283,6 +283,8 @@ | |||
283 | 283 | ||
284 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; | 284 | int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; |
285 | 285 | ||
286 | int sysctl_tcp_min_tso_segs __read_mostly = 2; | ||
287 | |||
286 | struct percpu_counter tcp_orphan_count; | 288 | struct percpu_counter tcp_orphan_count; |
287 | EXPORT_SYMBOL_GPL(tcp_orphan_count); | 289 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
288 | 290 | ||
@@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, | |||
785 | xmit_size_goal = mss_now; | 787 | xmit_size_goal = mss_now; |
786 | 788 | ||
787 | if (large_allowed && sk_can_gso(sk)) { | 789 | if (large_allowed && sk_can_gso(sk)) { |
788 | xmit_size_goal = ((sk->sk_gso_max_size - 1) - | 790 | u32 gso_size, hlen; |
789 | inet_csk(sk)->icsk_af_ops->net_header_len - | 791 | |
790 | inet_csk(sk)->icsk_ext_hdr_len - | 792 | /* Maybe we should/could use sk->sk_prot->max_header here ? */ |
791 | tp->tcp_header_len); | 793 | hlen = inet_csk(sk)->icsk_af_ops->net_header_len + |
794 | inet_csk(sk)->icsk_ext_hdr_len + | ||
795 | tp->tcp_header_len; | ||
796 | |||
797 | /* Goal is to send at least one packet per ms, | ||
798 | * not one big TSO packet every 100 ms. | ||
799 | * This preserves ACK clocking and is consistent | ||
800 | * with tcp_tso_should_defer() heuristic. | ||
801 | */ | ||
802 | gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); | ||
803 | gso_size = max_t(u32, gso_size, | ||
804 | sysctl_tcp_min_tso_segs * mss_now); | ||
805 | |||
806 | xmit_size_goal = min_t(u32, gso_size, | ||
807 | sk->sk_gso_max_size - 1 - hlen); | ||
792 | 808 | ||
793 | /* TSQ : try to have two TSO segments in flight */ | 809 | /* TSQ : try to have at least two segments in flight |
810 | * (one in NIC TX ring, another in Qdisc) | ||
811 | */ | ||
794 | xmit_size_goal = min_t(u32, xmit_size_goal, | 812 | xmit_size_goal = min_t(u32, xmit_size_goal, |
795 | sysctl_tcp_limit_output_bytes >> 1); | 813 | sysctl_tcp_limit_output_bytes >> 1); |
796 | 814 | ||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ec492eae0cd7..1a84fffe6993 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | |||
688 | } | 688 | } |
689 | } | 689 | } |
690 | 690 | ||
691 | /* Set the sk_pacing_rate to allow proper sizing of TSO packets. | ||
692 | * Note: TCP stack does not yet implement pacing. | ||
693 | * FQ packet scheduler can be used to implement cheap but effective | ||
694 | * TCP pacing, to smooth the burst on large writes when packets | ||
695 | * in flight is significantly lower than cwnd (or rwin) | ||
696 | */ | ||
697 | static void tcp_update_pacing_rate(struct sock *sk) | ||
698 | { | ||
699 | const struct tcp_sock *tp = tcp_sk(sk); | ||
700 | u64 rate; | ||
701 | |||
702 | /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ | ||
703 | rate = (u64)tp->mss_cache * 2 * (HZ << 3); | ||
704 | |||
705 | rate *= max(tp->snd_cwnd, tp->packets_out); | ||
706 | |||
707 | /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), | ||
708 | * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) | ||
709 | * We probably need usec resolution in the future. | ||
710 | * Note: This also takes care of possible srtt=0 case, | ||
711 | * when tcp_rtt_estimator() was not yet called. | ||
712 | */ | ||
713 | if (tp->srtt > 8 + 2) | ||
714 | do_div(rate, tp->srtt); | ||
715 | |||
716 | sk->sk_pacing_rate = min_t(u64, rate, ~0U); | ||
717 | } | ||
718 | |||
691 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 719 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
692 | * routine referred to above. | 720 | * routine referred to above. |
693 | */ | 721 | */ |
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3278 | u32 ack_seq = TCP_SKB_CB(skb)->seq; | 3306 | u32 ack_seq = TCP_SKB_CB(skb)->seq; |
3279 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 3307 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
3280 | bool is_dupack = false; | 3308 | bool is_dupack = false; |
3281 | u32 prior_in_flight; | 3309 | u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; |
3282 | u32 prior_fackets; | 3310 | u32 prior_fackets; |
3283 | int prior_packets = tp->packets_out; | 3311 | int prior_packets = tp->packets_out; |
3284 | const int prior_unsacked = tp->packets_out - tp->sacked_out; | 3312 | const int prior_unsacked = tp->packets_out - tp->sacked_out; |
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3383 | 3411 | ||
3384 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) | 3412 | if (icsk->icsk_pending == ICSK_TIME_RETRANS) |
3385 | tcp_schedule_loss_probe(sk); | 3413 | tcp_schedule_loss_probe(sk); |
3414 | if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) | ||
3415 | tcp_update_pacing_rate(sk); | ||
3386 | return 1; | 3416 | return 1; |
3387 | 3417 | ||
3388 | no_queue: | 3418 | no_queue: |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 884efff5b531..e63ae4c9691d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -1631,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
1631 | 1631 | ||
1632 | /* If a full-sized TSO skb can be sent, do it. */ | 1632 | /* If a full-sized TSO skb can be sent, do it. */ |
1633 | if (limit >= min_t(unsigned int, sk->sk_gso_max_size, | 1633 | if (limit >= min_t(unsigned int, sk->sk_gso_max_size, |
1634 | sk->sk_gso_max_segs * tp->mss_cache)) | 1634 | tp->xmit_size_goal_segs * tp->mss_cache)) |
1635 | goto send_now; | 1635 | goto send_now; |
1636 | 1636 | ||
1637 | /* Middle in queue won't get any more data, full sendable already? */ | 1637 | /* Middle in queue won't get any more data, full sendable already? */ |