aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2013-09-27 06:28:54 -0400
committerDavid S. Miller <davem@davemloft.net>2013-09-30 23:41:57 -0400
commitc9eeec26e32e087359160406f96e0949b3cc6f10 (patch)
treece64400c99dcd748411df462701b9609e496afd5 /net/ipv4
parentbf0ea6380724beb64f27a722dfc4b0edabff816e (diff)
tcp: TSQ can use a dynamic limit
When TCP Small Queues was added, we used a sysctl to limit amount of packets queues on Qdisc/device queues for a given TCP flow. Problem is this limit is either too big for low rates, or too small for high rates. Now TCP stack has rate estimation in sk->sk_pacing_rate, and TSO auto sizing, it can better control number of packets in Qdisc/device queues. New limit is two packets or at least 1 to 2 ms worth of packets. Low rates flows benefit from this patch by having even smaller number of packets in queues, allowing for faster recovery, better RTT estimations. High rates flows benefit from this patch by allowing more than 2 packets in flight as we had reports this was a limiting factor to reach line rate. [ In particular if TX completion is delayed because of coalescing parameters ] Example for a single flow on 10Gbp link controlled by FQ/pacing 14 packets in flight instead of 2 $ tc -s -d qd qdisc fq 8001: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 1024 quantum 3028 initial_quantum 15140 Sent 1168459366606 bytes 771822841 pkt (dropped 0, overlimits 0 requeues 6822476) rate 9346Mbit 771713pps backlog 953820b 14p requeues 6822476 2047 flow, 2046 inactive, 1 throttled, delay 15673 ns 2372 gc, 0 highprio, 0 retrans, 9739249 throttled, 0 flows_plimit Note that sk_pacing_rate is currently set to twice the actual rate, but this might be refined in the future when a flow is in congestion avoidance. Additional change : skb->destructor should be set to tcp_wfree(). A future patch (for linux 3.13+) might remove tcp_limit_output_bytes Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Wei Liu <wei.liu2@citrix.com> Cc: Cong Wang <xiyou.wangcong@gmail.com> Cc: Yuchung Cheng <ycheng@google.com> Cc: Neal Cardwell <ncardwell@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/tcp_output.c17
1 files changed, 11 insertions, 6 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7c83cb8bf137..e6bb8256e59f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -895,8 +895,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
895 895
896 skb_orphan(skb); 896 skb_orphan(skb);
897 skb->sk = sk; 897 skb->sk = sk;
898 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? 898 skb->destructor = tcp_wfree;
899 tcp_wfree : sock_wfree;
900 atomic_add(skb->truesize, &sk->sk_wmem_alloc); 899 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
901 900
902 /* Build TCP header and checksum it. */ 901 /* Build TCP header and checksum it. */
@@ -1840,7 +1839,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1840 while ((skb = tcp_send_head(sk))) { 1839 while ((skb = tcp_send_head(sk))) {
1841 unsigned int limit; 1840 unsigned int limit;
1842 1841
1843
1844 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); 1842 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1845 BUG_ON(!tso_segs); 1843 BUG_ON(!tso_segs);
1846 1844
@@ -1869,13 +1867,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1869 break; 1867 break;
1870 } 1868 }
1871 1869
1872 /* TSQ : sk_wmem_alloc accounts skb truesize, 1870 /* TCP Small Queues :
1873 * including skb overhead. But thats OK. 1871 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
1872 * This allows for :
1873 * - better RTT estimation and ACK scheduling
1874 * - faster recovery
1875 * - high rates
1874 */ 1876 */
1875 if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) { 1877 limit = max(skb->truesize, sk->sk_pacing_rate >> 10);
1878
1879 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
1876 set_bit(TSQ_THROTTLED, &tp->tsq_flags); 1880 set_bit(TSQ_THROTTLED, &tp->tsq_flags);
1877 break; 1881 break;
1878 } 1882 }
1883
1879 limit = mss_now; 1884 limit = mss_now;
1880 if (tso_segs > 1 && !tcp_urg_mode(tp)) 1885 if (tso_segs > 1 && !tcp_urg_mode(tp))
1881 limit = tcp_mss_split_point(sk, skb, mss_now, 1886 limit = tcp_mss_split_point(sk, skb, mss_now,