author		Neal Cardwell <ncardwell@google.com>	2014-05-22 10:41:08 -0400
committer	David S. Miller <davem@davemloft.net>	2014-05-22 12:04:49 -0400
commit		ca8a22634381537c92b5a10308652e1c38fd9edf (patch)
tree		5b5c47c2298af095e33ae1a5fd9f6192e13b56e3 /net/ipv4
parent		aff4b9743225d1c8a4cfa51b186bc3ad789dc8f9 (diff)
tcp: make cwnd-limited checks measurement-based, and gentler
Experience with the recent e114a710aa50 ("tcp: fix cwnd limited checking to improve congestion control") has shown that there are common cases where that commit can cause cwnd to be much larger than necessary. This leads to TSO autosizing cooking skbs that are too large, among other things.

The main problems seemed to be:

(1) That commit attempted to predict the future behavior of the connection by looking at the write queue (if TSO or TSQ limit sending). That prediction sometimes overestimated future outstanding packets.

(2) That commit always allowed cwnd to grow to twice the number of outstanding packets (even in congestion avoidance, where this is not needed).

This commit improves both of these, by:

(1) Switching to a measurement-based approach where we explicitly track the largest number of packets in flight during the past window ("max_packets_out"), and remember whether we were cwnd-limited at the moment we finished sending that flight.

(2) Only allowing cwnd to grow to twice the number of outstanding packets ("max_packets_out") in slow start. In congestion avoidance mode we now only allow cwnd to grow if it was fully utilized.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
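For context, the helper that consumes the new fields, tcp_is_cwnd_limited(), lives in include/net/tcp.h and is therefore filtered out of the net/ipv4 diff below. The following is only a sketch of the check the commit message describes (slow start may grow cwnd up to twice "max_packets_out"; congestion avoidance grows only if the previous flight was cwnd-limited), not the helper exactly as merged:

/* Sketch of the measurement-based check described above; the real helper
 * is tcp_is_cwnd_limited() in include/net/tcp.h, outside this diff.
 */
static inline bool tcp_is_cwnd_limited_sketch(const struct sock *sk)
{
        const struct tcp_sock *tp = tcp_sk(sk);

        /* In slow start, let cwnd grow until it reaches twice the largest
         * flight seen during the last window ("max_packets_out").
         */
        if (tp->snd_cwnd <= tp->snd_ssthresh)
                return tp->snd_cwnd < 2 * tp->max_packets_out;

        /* In congestion avoidance, grow only if the last flight actually
         * filled cwnd, as recorded by tcp_cwnd_validate() below.
         */
        return tp->is_cwnd_limited;
}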
Diffstat (limited to 'net/ipv4')
-rw-r--r--	net/ipv4/tcp_output.c	37
1 file changed, 23 insertions(+), 14 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3d61c52bdf79..d463c35db33d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1402,11 +1402,19 @@ static void tcp_cwnd_application_limited(struct sock *sk)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs)
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	tp->lsnd_pending = tp->packets_out + unsent_segs;
+	/* Track the maximum number of outstanding packets in each
+	 * window, and remember whether we were cwnd-limited then.
+	 */
+	if (!before(tp->snd_una, tp->max_packets_seq) ||
+	    tp->packets_out > tp->max_packets_out) {
+		tp->max_packets_out = tp->packets_out;
+		tp->max_packets_seq = tp->snd_nxt;
+		tp->is_cwnd_limited = is_cwnd_limited;
+	}
 
 	if (tcp_is_cwnd_limited(sk)) {
 		/* Network is feed fully. */
@@ -1660,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  *
  * This algorithm is from John Heffner.
  */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+				 bool *is_cwnd_limited)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1724,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	if (!tp->tso_deferred)
 		tp->tso_deferred = 1 | (jiffies << 1);
 
+	if (cong_win < send_win && cong_win < skb->len)
+		*is_cwnd_limited = true;
+
 	return true;
 
 send_now:
@@ -1881,9 +1893,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	unsigned int tso_segs, sent_pkts, unsent_segs = 0;
+	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
 	int result;
+	bool is_cwnd_limited = false;
 
 	sent_pkts = 0;
 
@@ -1908,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		cwnd_quota = tcp_cwnd_test(tp, skb);
 		if (!cwnd_quota) {
+			is_cwnd_limited = true;
 			if (push_one == 2)
 				/* Force out a loss probe pkt. */
 				cwnd_quota = 1;
@@ -1924,8 +1938,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				      nonagle : TCP_NAGLE_PUSH))))
 				break;
 		} else {
-			if (!push_one && tcp_tso_should_defer(sk, skb))
-				goto compute_unsent_segs;
+			if (!push_one &&
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+				break;
 		}
 
 		/* TCP Small Queues :
@@ -1950,14 +1965,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			 * there is no smp_mb__after_set_bit() yet
 			 */
 			smp_mb__after_clear_bit();
-			if (atomic_read(&sk->sk_wmem_alloc) > limit) {
-				u32 unsent_bytes;
-
-compute_unsent_segs:
-				unsent_bytes = tp->write_seq - tp->snd_nxt;
-				unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now);
+			if (atomic_read(&sk->sk_wmem_alloc) > limit)
 				break;
-			}
 		}
 
 		limit = mss_now;
@@ -1997,7 +2006,7 @@ repair:
 		/* Send one loss probe per tail loss episode. */
 		if (push_one != 2)
 			tcp_schedule_loss_probe(sk);
-		tcp_cwnd_validate(sk, unsent_segs);
+		tcp_cwnd_validate(sk, is_cwnd_limited);
 		return false;
 	}
 	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
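
As a rough usage sketch (not part of this patch), a Reno-style congestion-avoidance routine of this era consumes the check by returning early when the sender was not cwnd-limited, so an under-used cwnd stops inflating. The body below is illustrative rather than a verbatim copy of net/ipv4/tcp_cong.c:

/* Illustrative caller: skip all cwnd growth when the previous window
 * did not actually use the whole cwnd.
 */
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (!tcp_is_cwnd_limited(sk))
                return;

        /* In "safe" area, increase exponentially. */
        if (tp->snd_cwnd <= tp->snd_ssthresh)
                tcp_slow_start(tp, acked);
        /* In "dangerous" area, increase slowly (additively). */
        else
                tcp_cong_avoid_ai(tp, tp->snd_cwnd);
}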