diff options
author | Neal Cardwell <ncardwell@google.com> | 2014-05-22 10:41:08 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-05-22 12:04:49 -0400 |
commit | ca8a22634381537c92b5a10308652e1c38fd9edf (patch) | |
tree | 5b5c47c2298af095e33ae1a5fd9f6192e13b56e3 /net/ipv4 | |
parent | aff4b9743225d1c8a4cfa51b186bc3ad789dc8f9 (diff) |
tcp: make cwnd-limited checks measurement-based, and gentler
Experience with the recent e114a710aa50 ("tcp: fix cwnd limited
checking to improve congestion control") has shown that there are
common cases where that commit can cause cwnd to be much larger than
necessary. This leads to TSO autosizing cooking skbs that are too
large, among other things.
The main problems seemed to be:
(1) That commit attempted to predict the future behavior of the
connection by looking at the write queue (if TSO or TSQ limits
sending). That prediction sometimes overestimated future outstanding
packets.
(2) That commit always allowed cwnd to grow to twice the number of
outstanding packets (even in congestion avoidance, where this is not
needed).
This commit improves both of these, by:
(1) Switching to a measurement-based approach where we explicitly
track the largest number of packets in flight during the past window
("max_packets_out"), and remember whether we were cwnd-limited at the
moment we finished sending that flight.
(2) Only allowing cwnd to grow to twice the number of outstanding
packets ("max_packets_out") in slow start. In congestion avoidance
mode we now only allow cwnd to grow if it was fully utilized.
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/tcp_output.c | 37 |
1 file changed, 23 insertions, 14 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c<br>index 3d61c52bdf79..d463c35db33d 100644<br>--- a/net/ipv4/tcp_output.c<br>+++ b/net/ipv4/tcp_output.c | |||
@@ -1402,11 +1402,19 @@ static void tcp_cwnd_application_limited(struct sock *sk) | |||
1402 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1402 | tp->snd_cwnd_stamp = tcp_time_stamp; |
1403 | } | 1403 | } |
1404 | 1404 | ||
1405 | static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs) | 1405 | static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) |
1406 | { | 1406 | { |
1407 | struct tcp_sock *tp = tcp_sk(sk); | 1407 | struct tcp_sock *tp = tcp_sk(sk); |
1408 | 1408 | ||
1409 | tp->lsnd_pending = tp->packets_out + unsent_segs; | 1409 | /* Track the maximum number of outstanding packets in each |
1410 | * window, and remember whether we were cwnd-limited then. | ||
1411 | */ | ||
1412 | if (!before(tp->snd_una, tp->max_packets_seq) || | ||
1413 | tp->packets_out > tp->max_packets_out) { | ||
1414 | tp->max_packets_out = tp->packets_out; | ||
1415 | tp->max_packets_seq = tp->snd_nxt; | ||
1416 | tp->is_cwnd_limited = is_cwnd_limited; | ||
1417 | } | ||
1410 | 1418 | ||
1411 | if (tcp_is_cwnd_limited(sk)) { | 1419 | if (tcp_is_cwnd_limited(sk)) { |
1412 | /* Network is feed fully. */ | 1420 | /* Network is feed fully. */ |
@@ -1660,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, | |||
1660 | * | 1668 | * |
1661 | * This algorithm is from John Heffner. | 1669 | * This algorithm is from John Heffner. |
1662 | */ | 1670 | */ |
1663 | static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | 1671 | static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, |
1672 | bool *is_cwnd_limited) | ||
1664 | { | 1673 | { |
1665 | struct tcp_sock *tp = tcp_sk(sk); | 1674 | struct tcp_sock *tp = tcp_sk(sk); |
1666 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1675 | const struct inet_connection_sock *icsk = inet_csk(sk); |
@@ -1724,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) | |||
1724 | if (!tp->tso_deferred) | 1733 | if (!tp->tso_deferred) |
1725 | tp->tso_deferred = 1 | (jiffies << 1); | 1734 | tp->tso_deferred = 1 | (jiffies << 1); |
1726 | 1735 | ||
1736 | if (cong_win < send_win && cong_win < skb->len) | ||
1737 | *is_cwnd_limited = true; | ||
1738 | |||
1727 | return true; | 1739 | return true; |
1728 | 1740 | ||
1729 | send_now: | 1741 | send_now: |
@@ -1881,9 +1893,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1881 | { | 1893 | { |
1882 | struct tcp_sock *tp = tcp_sk(sk); | 1894 | struct tcp_sock *tp = tcp_sk(sk); |
1883 | struct sk_buff *skb; | 1895 | struct sk_buff *skb; |
1884 | unsigned int tso_segs, sent_pkts, unsent_segs = 0; | 1896 | unsigned int tso_segs, sent_pkts; |
1885 | int cwnd_quota; | 1897 | int cwnd_quota; |
1886 | int result; | 1898 | int result; |
1899 | bool is_cwnd_limited = false; | ||
1887 | 1900 | ||
1888 | sent_pkts = 0; | 1901 | sent_pkts = 0; |
1889 | 1902 | ||
@@ -1908,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1908 | 1921 | ||
1909 | cwnd_quota = tcp_cwnd_test(tp, skb); | 1922 | cwnd_quota = tcp_cwnd_test(tp, skb); |
1910 | if (!cwnd_quota) { | 1923 | if (!cwnd_quota) { |
1924 | is_cwnd_limited = true; | ||
1911 | if (push_one == 2) | 1925 | if (push_one == 2) |
1912 | /* Force out a loss probe pkt. */ | 1926 | /* Force out a loss probe pkt. */ |
1913 | cwnd_quota = 1; | 1927 | cwnd_quota = 1; |
@@ -1924,8 +1938,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1924 | nonagle : TCP_NAGLE_PUSH)))) | 1938 | nonagle : TCP_NAGLE_PUSH)))) |
1925 | break; | 1939 | break; |
1926 | } else { | 1940 | } else { |
1927 | if (!push_one && tcp_tso_should_defer(sk, skb)) | 1941 | if (!push_one && |
1928 | goto compute_unsent_segs; | 1942 | tcp_tso_should_defer(sk, skb, &is_cwnd_limited)) |
1943 | break; | ||
1929 | } | 1944 | } |
1930 | 1945 | ||
1931 | /* TCP Small Queues : | 1946 | /* TCP Small Queues : |
@@ -1950,14 +1965,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, | |||
1950 | * there is no smp_mb__after_set_bit() yet | 1965 | * there is no smp_mb__after_set_bit() yet |
1951 | */ | 1966 | */ |
1952 | smp_mb__after_clear_bit(); | 1967 | smp_mb__after_clear_bit(); |
1953 | if (atomic_read(&sk->sk_wmem_alloc) > limit) { | 1968 | if (atomic_read(&sk->sk_wmem_alloc) > limit) |
1954 | u32 unsent_bytes; | ||
1955 | |||
1956 | compute_unsent_segs: | ||
1957 | unsent_bytes = tp->write_seq - tp->snd_nxt; | ||
1958 | unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now); | ||
1959 | break; | 1969 | break; |
1960 | } | ||
1961 | } | 1970 | } |
1962 | 1971 | ||
1963 | limit = mss_now; | 1972 | limit = mss_now; |
@@ -1997,7 +2006,7 @@ repair: | |||
1997 | /* Send one loss probe per tail loss episode. */ | 2006 | /* Send one loss probe per tail loss episode. */ |
1998 | if (push_one != 2) | 2007 | if (push_one != 2) |
1999 | tcp_schedule_loss_probe(sk); | 2008 | tcp_schedule_loss_probe(sk); |
2000 | tcp_cwnd_validate(sk, unsent_segs); | 2009 | tcp_cwnd_validate(sk, is_cwnd_limited); |
2001 | return false; | 2010 | return false; |
2002 | } | 2011 | } |
2003 | return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); | 2012 | return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); |