author     Eric Dumazet <edumazet@google.com>       2014-12-07 15:22:18 -0500
committer  David S. Miller <davem@davemloft.net>    2014-12-09 16:39:22 -0500
commit     605ad7f184b60cfaacbc038aa6c55ee68dee3c89 (patch)
tree       e4c88937452f13283365fdcd4d1b5a900c6084a7 /net/ipv4
parent     5e84e189ce1323978afebfba89d3c18cd3f3643c (diff)
tcp: refine TSO autosizing
Commit 95bd09eb2750 ("tcp: TSO packets automatic sizing") tried to control
TSO size, but did this at the wrong place (sendmsg() time).

At sendmsg() time, we might have a pessimistic view of the flow rate, and we
end up building very small skbs (with 2 MSS per skb).

This is bad because:

- It sends small TSO packets even in Slow Start, where the rate quickly
  increases.
- It tends to make the socket write queue very big, increasing tcp_ack()
  processing time, but also increasing memory needs, not necessarily
  accounted for, as fast clone overhead is currently ignored.
- It lowers GRO efficiency and generates more ACK packets.

Servers with a lot of short-lived connections suffer from this.

Let's instead fill skbs as much as possible (64KB of payload), but split them
at xmit time, when we have a precise idea of the flow rate. The skb split is
actually quite efficient.

The patch looks bigger than necessary, because the TCP Small Queues decision
now has to take place after the eventual split.

As Neal suggested, introduce a new tcp_tso_autosize() helper, so that
tcp_tso_should_defer() can be synchronized on the same goal.

Rename tp->xmit_size_goal_segs to tp->gso_segs, as this variable contains the
number of MSS we can put in a GSO packet, and is no longer related to the
autosizing goal.

Tested on a 40 ms RTT link:

nstat >/dev/null
netperf -H remote -l -2000000 -- -s 1000000
nstat | egrep "IpInReceives|IpOutRequests|TcpOutSegs|IpExtOutOctets"

Before patch:

Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/s

 87380 2000000 2000000     0.36        44.22

IpInReceives        600       0.0
IpOutRequests       599       0.0
TcpOutSegs         1397       0.0
IpExtOutOctets  2033249       0.0

After patch:

Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380 2000000 2000000     0.36        44.27

IpInReceives        221       0.0
IpOutRequests       232       0.0
TcpOutSegs         1397       0.0
IpExtOutOctets  2013953       0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
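Editor's note: as an aid for reading the diff below, here is the per-millisecond
sizing arithmetic of the new tcp_tso_autosize() helper replayed as a standalone
user-space sketch. The constants and sample inputs here (GSO_MAX_SIZE,
MAX_TCP_HDR, MIN_TSO_SEGS, GSO_MAX_SEGS, the MSS and the 10 Mbit/s and 1 Gbit/s
pacing rates) are illustrative assumptions, not values taken from this patch.

/* tso_autosize_sketch.c - illustrative, user-space mirror of the
 * tcp_tso_autosize() arithmetic added by this patch (assumed constants).
 */
#include <stdio.h>
#include <stdint.h>

#define GSO_MAX_SIZE 65536u   /* assumed stand-in for sk->sk_gso_max_size */
#define MAX_TCP_HDR  320u     /* assumed stand-in for MAX_TCP_HEADER */
#define MIN_TSO_SEGS 2u       /* assumed stand-in for sysctl_tcp_min_tso_segs */
#define GSO_MAX_SEGS 65535u   /* assumed stand-in for sk->sk_gso_max_segs */

static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }
static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

/* Goal: roughly one TSO packet per millisecond at the current pacing rate. */
static uint32_t tso_autosize(uint64_t pacing_rate, uint32_t mss)
{
        /* pacing_rate is in bytes/sec; >> 10 is a cheap approximation of
         * dividing by 1000, i.e. the byte budget for about one millisecond.
         */
        uint32_t bytes = min_u32((uint32_t)(pacing_rate >> 10),
                                 GSO_MAX_SIZE - 1 - MAX_TCP_HDR);
        uint32_t segs = max_u32(bytes / mss, MIN_TSO_SEGS);

        return min_u32(segs, GSO_MAX_SEGS);
}

int main(void)
{
        uint32_t mss = 1448;    /* assumed MSS for the example */

        printf("at 10 Mbit/s: %u segs per TSO packet\n",
               tso_autosize(1250000ull, mss));
        printf("at 1 Gbit/s : %u segs per TSO packet\n",
               tso_autosize(125000000ull, mss));
        return 0;
}

With a low rate estimate the goal stays at the 2-segment floor, while at
1 Gbit/s it approaches the 64KB GSO limit; this is why the patch defers the
decision to xmit time, when the rate estimate is reliable, instead of making
it at sendmsg() time.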
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/tcp.c         60
-rw-r--r--  net/ipv4/tcp_output.c  59
2 files changed, 62 insertions(+), 57 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dc13a3657e8e..427aee33ffc0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -835,47 +835,29 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
                                        int large_allowed)
 {
         struct tcp_sock *tp = tcp_sk(sk);
-        u32 xmit_size_goal, old_size_goal;
+        u32 new_size_goal, size_goal, hlen;
 
-        xmit_size_goal = mss_now;
-
-        if (large_allowed && sk_can_gso(sk)) {
-                u32 gso_size, hlen;
-
-                /* Maybe we should/could use sk->sk_prot->max_header here ? */
-                hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
-                       inet_csk(sk)->icsk_ext_hdr_len +
-                       tp->tcp_header_len;
-
-                /* Goal is to send at least one packet per ms,
-                 * not one big TSO packet every 100 ms.
-                 * This preserves ACK clocking and is consistent
-                 * with tcp_tso_should_defer() heuristic.
-                 */
-                gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
-                gso_size = max_t(u32, gso_size,
-                                 sysctl_tcp_min_tso_segs * mss_now);
-
-                xmit_size_goal = min_t(u32, gso_size,
-                                       sk->sk_gso_max_size - 1 - hlen);
-
-                xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
-
-                /* We try hard to avoid divides here */
-                old_size_goal = tp->xmit_size_goal_segs * mss_now;
-
-                if (likely(old_size_goal <= xmit_size_goal &&
-                           old_size_goal + mss_now > xmit_size_goal)) {
-                        xmit_size_goal = old_size_goal;
-                } else {
-                        tp->xmit_size_goal_segs =
-                                min_t(u16, xmit_size_goal / mss_now,
-                                      sk->sk_gso_max_segs);
-                        xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
-                }
+        if (!large_allowed || !sk_can_gso(sk))
+                return mss_now;
+
+        /* Maybe we should/could use sk->sk_prot->max_header here ? */
+        hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+               inet_csk(sk)->icsk_ext_hdr_len +
+               tp->tcp_header_len;
+
+        new_size_goal = sk->sk_gso_max_size - 1 - hlen;
+        new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+
+        /* We try hard to avoid divides here */
+        size_goal = tp->gso_segs * mss_now;
+        if (unlikely(new_size_goal < size_goal ||
+                     new_size_goal >= size_goal + mss_now)) {
+                tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+                                     sk->sk_gso_max_segs);
+                size_goal = tp->gso_segs * mss_now;
         }
 
-        return max(xmit_size_goal, mss_now);
+        return max(size_goal, mss_now);
 }
 
 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f5bd4bd3f7e6..f37ecf53ee8a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1524,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
                ((nonagle & TCP_NAGLE_CORK) ||
                 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+        u32 bytes, segs;
+
+        bytes = min(sk->sk_pacing_rate >> 10,
+                    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+        /* Goal is to send at least one packet per ms,
+         * not one big TSO packet every 100 ms.
+         * This preserves ACK clocking and is consistent
+         * with tcp_tso_should_defer() heuristic.
+         */
+        segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+        return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
 /* Returns the portion of skb which can be sent right away */
 static unsigned int tcp_mss_split_point(const struct sock *sk,
                                         const struct sk_buff *skb,
@@ -1731,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  * This algorithm is from John Heffner.
  */
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
-                                 bool *is_cwnd_limited)
+                                 bool *is_cwnd_limited, u32 max_segs)
 {
         struct tcp_sock *tp = tcp_sk(sk);
         const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1761,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
         limit = min(send_win, cong_win);
 
         /* If a full-sized TSO skb can be sent, do it. */
-        if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-                           tp->xmit_size_goal_segs * tp->mss_cache))
+        if (limit >= max_segs * tp->mss_cache)
                 goto send_now;
 
         /* Middle in queue won't get any more data, full sendable already? */
@@ -1959,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
         int cwnd_quota;
         int result;
         bool is_cwnd_limited = false;
+        u32 max_segs;
 
         sent_pkts = 0;
 
@@ -1972,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                 }
         }
 
+        max_segs = tcp_tso_autosize(sk, mss_now);
         while ((skb = tcp_send_head(sk))) {
                 unsigned int limit;
 
@@ -2004,10 +2026,23 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                 break;
                 } else {
                         if (!push_one &&
-                            tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+                            tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+                                                 max_segs))
                                 break;
                 }
 
+                limit = mss_now;
+                if (tso_segs > 1 && !tcp_urg_mode(tp))
+                        limit = tcp_mss_split_point(sk, skb, mss_now,
+                                                    min_t(unsigned int,
+                                                          cwnd_quota,
+                                                          max_segs),
+                                                    nonagle);
+
+                if (skb->len > limit &&
+                    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+                        break;
+
                 /* TCP Small Queues :
                  * Control number of packets in qdisc/devices to two packets / or ~1 ms.
                  * This allows for :
@@ -2018,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                  * of queued bytes to ensure line rate.
                  * One example is wifi aggregation (802.11 AMPDU)
                  */
-                limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
-                              sk->sk_pacing_rate >> 10);
+                limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+                limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
 
                 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
                         set_bit(TSQ_THROTTLED, &tp->tsq_flags);
@@ -2032,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                         break;
                 }
 
-                limit = mss_now;
-                if (tso_segs > 1 && !tcp_urg_mode(tp))
-                        limit = tcp_mss_split_point(sk, skb, mss_now,
-                                                    min_t(unsigned int,
-                                                          cwnd_quota,
-                                                          sk->sk_gso_max_segs),
-                                                    nonagle);
-
-                if (skb->len > limit &&
-                    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
-                        break;
-
                 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
                         break;
 
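Editor's note on the TCP Small Queues change in the tcp_output.c hunk above:
the old code took the larger of sysctl_tcp_limit_output_bytes and the
pacing-derived budget, so the sysctl acted as a floor; the new code budgets
roughly 1 ms of data (but never less than two skbs worth of truesize) and
uses the sysctl as a cap. The sketch below replays that arithmetic in user
space; the sysctl default and the skb truesize are assumed values chosen
only for illustration.

/* tsq_limit_sketch.c - illustrative mirror of the new TSQ limit computation
 * (assumed constants, not kernel code).
 */
#include <stdio.h>
#include <stdint.h>

#define TCP_LIMIT_OUTPUT_BYTES 131072u  /* assumed sysctl_tcp_limit_output_bytes */

static uint32_t tsq_limit(uint64_t pacing_rate, uint32_t skb_truesize)
{
        /* ~1 ms of payload at the current pacing rate, floored at two skbs,
         * capped by the sysctl.
         */
        uint32_t limit = (uint32_t)(pacing_rate >> 10);

        if (limit < 2 * skb_truesize)
                limit = 2 * skb_truesize;
        if (limit > TCP_LIMIT_OUTPUT_BYTES)
                limit = TCP_LIMIT_OUTPUT_BYTES;
        return limit;
}

int main(void)
{
        uint32_t truesize = 4096;       /* assumed truesize of one queued skb */

        printf("10 Mbit/s : %u bytes\n", tsq_limit(1250000ull, truesize));    /* floor: 2 skbs  */
        printf("1 Gbit/s  : %u bytes\n", tsq_limit(125000000ull, truesize));  /* ~1 ms of data  */
        printf("10 Gbit/s : %u bytes\n", tsq_limit(1250000000ull, truesize)); /* sysctl ceiling */
        return 0;
}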