diff options
author | Eric Dumazet <edumazet@google.com> | 2014-12-07 15:22:18 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-12-09 16:39:22 -0500 |
commit | 605ad7f184b60cfaacbc038aa6c55ee68dee3c89 (patch) | |
tree | e4c88937452f13283365fdcd4d1b5a900c6084a7 /net/ipv4 | |
parent | 5e84e189ce1323978afebfba89d3c18cd3f3643c (diff) |
tcp: refine TSO autosizing
Commit 95bd09eb2750 ("tcp: TSO packets automatic sizing") tried to
control TSO size, but did this at the wrong place (sendmsg() time).
At sendmsg() time, we might have a pessimistic view of flow rate,
and we end up building very small skbs (with 2 MSS per skb).
This is bad because:
- It sends small TSO packets even in Slow Start, where the rate quickly
increases.
- It tends to make the socket write queue very big, increasing tcp_ack()
processing time, but also increasing memory needs, not necessarily
accounted for, as fast clones overhead is currently ignored.
- It lowers GRO efficiency and generates more ACK packets.
Servers with a lot of short-lived connections suffer from this.
Let's instead fill skbs as much as possible (64KB of payload), but split
them at xmit time, when we have a precise idea of the flow rate.
skb split is actually quite efficient.
The patch looks bigger than necessary, because the TCP Small Queue decision
now has to take place after the possible split.
As Neal suggested, introduce a new tcp_tso_autosize() helper, so that
tcp_tso_should_defer() can be synchronized on same goal.
Rename tp->xmit_size_goal_segs to tp->gso_segs, as this variable
contains the number of MSS that we can put in a GSO packet, and is no
longer related to the autosizing goal.
Tested:
40 ms rtt link
nstat >/dev/null
netperf -H remote -l -2000000 -- -s 1000000
nstat | egrep "IpInReceives|IpOutRequests|TcpOutSegs|IpExtOutOctets"
Before patch :
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. 10^6bits/s
87380 2000000 2000000 0.36 44.22
IpInReceives 600 0.0
IpOutRequests 599 0.0
TcpOutSegs 1397 0.0
IpExtOutOctets 2033249 0.0
After patch :
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. 10^6bits/sec
87380 2000000 2000000 0.36 44.27
IpInReceives 221 0.0
IpOutRequests 232 0.0
TcpOutSegs 1397 0.0
IpExtOutOctets 2013953 0.0
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/tcp.c | 60 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 59 |
2 files changed, 62 insertions, 57 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dc13a3657e8e..427aee33ffc0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -835,47 +835,29 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, | |||
835 | int large_allowed) | 835 | int large_allowed) |
836 | { | 836 | { |
837 | struct tcp_sock *tp = tcp_sk(sk); | 837 | struct tcp_sock *tp = tcp_sk(sk); |
838 | u32 xmit_size_goal, old_size_goal; | 838 | u32 new_size_goal, size_goal, hlen; |
839 | 839 | ||
840 | xmit_size_goal = mss_now; | 840 | if (!large_allowed || !sk_can_gso(sk)) |
841 | 841 | return mss_now; | |
842 | if (large_allowed && sk_can_gso(sk)) { | 842 | |
843 | u32 gso_size, hlen; | 843 | /* Maybe we should/could use sk->sk_prot->max_header here ? */ |
844 | 844 | hlen = inet_csk(sk)->icsk_af_ops->net_header_len + | |
845 | /* Maybe we should/could use sk->sk_prot->max_header here ? */ | 845 | inet_csk(sk)->icsk_ext_hdr_len + |
846 | hlen = inet_csk(sk)->icsk_af_ops->net_header_len + | 846 | tp->tcp_header_len; |
847 | inet_csk(sk)->icsk_ext_hdr_len + | 847 | |
848 | tp->tcp_header_len; | 848 | new_size_goal = sk->sk_gso_max_size - 1 - hlen; |
849 | 849 | new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); | |
850 | /* Goal is to send at least one packet per ms, | 850 | |
851 | * not one big TSO packet every 100 ms. | 851 | /* We try hard to avoid divides here */ |
852 | * This preserves ACK clocking and is consistent | 852 | size_goal = tp->gso_segs * mss_now; |
853 | * with tcp_tso_should_defer() heuristic. | 853 | if (unlikely(new_size_goal < size_goal || |
854 | */ | 854 | new_size_goal >= size_goal + mss_now)) { |
855 | gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC); | 855 | tp->gso_segs = min_t(u16, new_size_goal / mss_now, |
856 | gso_size = max_t(u32, gso_size, | 856 | sk->sk_gso_max_segs); |
857 | sysctl_tcp_min_tso_segs * mss_now); | 857 | size_goal = tp->gso_segs * mss_now; |
858 | |||
859 | xmit_size_goal = min_t(u32, gso_size, | ||
860 | sk->sk_gso_max_size - 1 - hlen); | ||
861 | |||
862 | xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); | ||
863 | |||
864 | /* We try hard to avoid divides here */ | ||
865 | old_size_goal = tp->xmit_size_goal_segs * mss_now; | ||
866 | |||
867 | if (likely(old_size_goal <= xmit_size_goal && | ||
868 | old_size_goal + mss_now > xmit_size_goal)) { | ||
869 | xmit_size_goal = old_size_goal; | ||
870 | } else { | ||
871 | tp->xmit_size_goal_segs = | ||
872 | min_t(u16, xmit_size_goal / mss_now, | ||
873 | sk->sk_gso_max_segs); | ||
874 | xmit_size_goal = tp->xmit_size_goal_segs * mss_now; | ||
875 | } | ||
876 | } | 858 | } |
877 | 859 | ||
878 | return max(xmit_size_goal, mss_now); | 860 | return max(size_goal, mss_now); |
879 | } | 861 | } |
880 | 862 | ||
881 | static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) | 863 | static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f5bd4bd3f7e6..f37ecf53ee8a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -1524,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, | |||
1524 | ((nonagle & TCP_NAGLE_CORK) || | 1524 | ((nonagle & TCP_NAGLE_CORK) || |
1525 | (!nonagle && tp->packets_out && tcp_minshall_check(tp))); | 1525 | (!nonagle && tp->packets_out && tcp_minshall_check(tp))); |
1526 | } | 1526 | } |
1527 | |||
1528 | /* Return how many segs we'd like on a TSO packet, | ||
1529 | * to send one TSO packet per ms | ||
1530 | */ | ||
1531 | static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) | ||
1532 | { | ||
1533 | u32 bytes, segs; | ||
1534 | |||
1535 | bytes = min(sk->sk_pacing_rate >> 10, | ||
1536 | sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); | ||
1537 | |||
1538 | /* Goal is to send at least one packet per ms, | ||
1539 | * not one big TSO packet every 100 ms. | ||
1540 | * This preserves ACK clocking and is consistent | ||
1541 | * with tcp_tso_should_defer() heuristic. | ||
1542 | */ | ||
1543 | segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); | ||
1544 | |||
1545 | return min_t(u32, segs, sk->sk_gso_max_segs); | ||
1546 | } | ||
1547 | |||
1527 | /* Returns the portion of skb which can be sent right away */ | 1548 | /* Returns the portion of skb which can be sent right away */ |
1528 | static unsigned int tcp_mss_split_point(const struct sock *sk, | 1549 | static unsigned int tcp_mss_split_point(const struct sock *sk, |
1529 | const struct sk_buff *skb, | 1550 | const struct sk_buff *skb, |
@@ -1731,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
1731 | * This algorithm is from John Heffner. | 1752 | * This algorithm is from John Heffner. |
1732 | */ | 1753 | */ |
1733 | static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, | 1754 | static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, |
1734 | bool *is_cwnd_limited) | 1755 | bool *is_cwnd_limited, u32 max_segs) |
1735 | { | 1756 | { |
1736 | struct tcp_sock *tp = tcp_sk(sk); | 1757 | struct tcp_sock *tp = tcp_sk(sk); |
1737 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1758 | const struct inet_connection_sock *icsk = inet_csk(sk); |
@@ -1761,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, | |||
1761 | limit = min(send_win, cong_win); | 1782 | limit = min(send_win, cong_win); |
1762 | 1783 | ||
1763 | /* If a full-sized TSO skb can be sent, do it. */ | 1784 | /* If a full-sized TSO skb can be sent, do it. */ |
1764 | if (limit >= min_t(unsigned int, sk->sk_gso_max_size, | 1785 | if (limit >= max_segs * tp->mss_cache) |
1765 | tp->xmit_size_goal_segs * tp->mss_cache)) | ||
1766 | goto send_now; | 1786 | goto send_now; |
1767 | 1787 | ||
1768 | /* Middle in queue won't get any more data, full sendable already? */ | 1788 | /* Middle in queue won't get any more data, full sendable already? */ |
@@ -1959,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1959 | int cwnd_quota; | 1979 | int cwnd_quota; |
1960 | int result; | 1980 | int result; |
1961 | bool is_cwnd_limited = false; | 1981 | bool is_cwnd_limited = false; |
1982 | u32 max_segs; | ||
1962 | 1983 | ||
1963 | sent_pkts = 0; | 1984 | sent_pkts = 0; |
1964 | 1985 | ||
@@ -1972,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1972 | } | 1993 | } |
1973 | } | 1994 | } |
1974 | 1995 | ||
1996 | max_segs = tcp_tso_autosize(sk, mss_now); | ||
1975 | while ((skb = tcp_send_head(sk))) { | 1997 | while ((skb = tcp_send_head(sk))) { |
1976 | unsigned int limit; | 1998 | unsigned int limit; |
1977 | 1999 | ||
@@ -2004,10 +2026,23 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
2004 | break; | 2026 | break; |
2005 | } else { | 2027 | } else { |
2006 | if (!push_one && | 2028 | if (!push_one && |
2007 | tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) | 2029 | tcp_tso_should_defer(sk, skb, &is_cwnd_limited, |
2030 | max_segs)) | ||
2008 | break; | 2031 | break; |
2009 | } | 2032 | } |
2010 | 2033 | ||
2034 | limit = mss_now; | ||
2035 | if (tso_segs > 1 && !tcp_urg_mode(tp)) | ||
2036 | limit = tcp_mss_split_point(sk, skb, mss_now, | ||
2037 | min_t(unsigned int, | ||
2038 | cwnd_quota, | ||
2039 | max_segs), | ||
2040 | nonagle); | ||
2041 | |||
2042 | if (skb->len > limit && | ||
2043 | unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) | ||
2044 | break; | ||
2045 | |||
2011 | /* TCP Small Queues : | 2046 | /* TCP Small Queues : |
2012 | * Control number of packets in qdisc/devices to two packets / or ~1 ms. | 2047 | * Control number of packets in qdisc/devices to two packets / or ~1 ms. |
2013 | * This allows for : | 2048 | * This allows for : |
@@ -2018,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
2018 | * of queued bytes to ensure line rate. | 2053 | * of queued bytes to ensure line rate. |
2019 | * One example is wifi aggregation (802.11 AMPDU) | 2054 | * One example is wifi aggregation (802.11 AMPDU) |
2020 | */ | 2055 | */ |
2021 | limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes, | 2056 | limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); |
2022 | sk->sk_pacing_rate >> 10); | 2057 | limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); |
2023 | 2058 | ||
2024 | if (atomic_read(&sk->sk_wmem_alloc) > limit) { | 2059 | if (atomic_read(&sk->sk_wmem_alloc) > limit) { |
2025 | set_bit(TSQ_THROTTLED, &tp->tsq_flags); | 2060 | set_bit(TSQ_THROTTLED, &tp->tsq_flags); |
@@ -2032,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
2032 | break; | 2067 | break; |
2033 | } | 2068 | } |
2034 | 2069 | ||
2035 | limit = mss_now; | ||
2036 | if (tso_segs > 1 && !tcp_urg_mode(tp)) | ||
2037 | limit = tcp_mss_split_point(sk, skb, mss_now, | ||
2038 | min_t(unsigned int, | ||
2039 | cwnd_quota, | ||
2040 | sk->sk_gso_max_segs), | ||
2041 | nonagle); | ||
2042 | |||
2043 | if (skb->len > limit && | ||
2044 | unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) | ||
2045 | break; | ||
2046 | |||
2047 | if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) | 2070 | if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) |
2048 | break; | 2071 | break; |
2049 | 2072 | ||