author    Neal Cardwell <ncardwell@google.com>  2014-05-22 10:41:08 -0400
committer David S. Miller <davem@davemloft.net> 2014-05-22 12:04:49 -0400
commit    ca8a22634381537c92b5a10308652e1c38fd9edf
tree      5b5c47c2298af095e33ae1a5fd9f6192e13b56e3
parent    aff4b9743225d1c8a4cfa51b186bc3ad789dc8f9
tcp: make cwnd-limited checks measurement-based, and gentler
Experience with the recent e114a710aa50 ("tcp: fix cwnd limited checking
to improve congestion control") has shown that there are common cases
where that commit can cause cwnd to be much larger than necessary. This
leads to TSO autosizing cooking skbs that are too large, among other
things.

The main problems seemed to be:

(1) That commit attempted to predict the future behavior of the
connection by looking at the write queue (if TSO or TSQ limit sending).
That prediction sometimes overestimated future outstanding packets.

(2) That commit always allowed cwnd to grow to twice the number of
outstanding packets (even in congestion avoidance, where this is not
needed).

This commit improves both of these, by:

(1) Switching to a measurement-based approach where we explicitly track
the largest number of packets in flight during the past window
("max_packets_out"), and remember whether we were cwnd-limited at the
moment we finished sending that flight.

(2) Only allowing cwnd to grow to twice the number of outstanding
packets ("max_packets_out") in slow start. In congestion avoidance mode
we now only allow cwnd to grow if it was fully utilized.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
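For illustration only (not part of the patch): a minimal standalone C
sketch of the resulting policy. The struct and values here are
hypothetical stand-ins for the tcp_sock fields touched below. In slow
start, cwnd may keep growing until it reaches twice the measured peak
in flight; in congestion avoidance it may grow only if the previous
flight was actually cwnd-limited.

	/* Toy model of the patched policy; a sketch, not kernel code. */
	#include <stdbool.h>
	#include <stdio.h>

	struct toy_tp {
		unsigned int snd_cwnd;        /* congestion window (packets) */
		unsigned int snd_ssthresh;    /* slow start threshold */
		unsigned int max_packets_out; /* peak packets in flight, last window */
		bool is_cwnd_limited;         /* was that flight cwnd-limited? */
	};

	static bool toy_is_cwnd_limited(const struct toy_tp *tp)
	{
		/* Slow start: allow growth to twice the measured peak in flight. */
		if (tp->snd_cwnd <= tp->snd_ssthresh)
			return tp->snd_cwnd < 2 * tp->max_packets_out;
		/* Congestion avoidance: grow only if cwnd was fully utilized. */
		return tp->is_cwnd_limited;
	}

	int main(void)
	{
		struct toy_tp ss = { .snd_cwnd = 10, .snd_ssthresh = 64,
				     .max_packets_out = 9, .is_cwnd_limited = false };
		struct toy_tp ca = { .snd_cwnd = 100, .snd_ssthresh = 64,
				     .max_packets_out = 60, .is_cwnd_limited = false };

		printf("slow start, 9 of 10 used: %s\n",
		       toy_is_cwnd_limited(&ss) ? "may grow" : "hold");
		printf("congestion avoidance, underused: %s\n",
		       toy_is_cwnd_limited(&ca) ? "may grow" : "hold");
		return 0;
	}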
 include/linux/tcp.h   |  6 ++++--
 include/net/tcp.h     | 11 ++++++++---
 net/ipv4/tcp_output.c | 37 +++++++++++++++++++++++--------------
 3 files changed, 35 insertions(+), 19 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index bc35e4709e8e..a0513210798f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -197,7 +197,8 @@ struct tcp_sock {
 	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
 		syn_data:1,	/* SYN includes data */
 		syn_fastopen:1,	/* SYN includes Fast Open option */
-		syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
+		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
+		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
 
 /* RTT measurement */
@@ -209,6 +210,8 @@ struct tcp_sock {
 
 	u32	packets_out;	/* Packets which are "in flight"	*/
 	u32	retrans_out;	/* Retransmitted packets out		*/
+	u32	max_packets_out;  /* max packets_out in last window */
+	u32	max_packets_seq;  /* right edge of max_packets_out flight */
 
 	u16	urg_data;	/* Saved octet of OOB data and control flags */
 	u8	ecn_flags;	/* ECN status bits.			*/
@@ -230,7 +233,6 @@ struct tcp_sock {
 	u32	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
 	u32	snd_cwnd_used;
 	u32	snd_cwnd_stamp;
-	u32	lsnd_pending;	/* packets inflight or unsent since last xmit */
 	u32	prior_cwnd;	/* Congestion window at start of Recovery. */
 	u32	prr_delivered;	/* Number of newly delivered packets to
 				 * receiver in Recovery. */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f5d6ca4a9d28..e80abe4486cb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -971,8 +971,9 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
 
 /* We follow the spirit of RFC2861 to validate cwnd but implement a more
  * flexible approach. The RFC suggests cwnd should not be raised unless
- * it was fully used previously. But we allow cwnd to grow as long as the
- * application has used half the cwnd.
+ * it was fully used previously. And that's exactly what we do in
+ * congestion avoidance mode. But in slow start we allow cwnd to grow
+ * as long as the application has used half the cwnd.
  * Example :
  *    cwnd is 10 (IW10), but application sends 9 frames.
  *    We allow cwnd to reach 18 when all frames are ACKed.
@@ -985,7 +986,11 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
-	return tp->snd_cwnd < 2 * tp->lsnd_pending;
+	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		return tp->snd_cwnd < 2 * tp->max_packets_out;
+
+	return tp->is_cwnd_limited;
 }
 
 static inline void tcp_check_probe_timer(struct sock *sk)
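Working through the comment's example under the new check (not part of
the patch): with IW10 and 9 frames sent, max_packets_out is 9, so in
slow start the flow counts as cwnd-limited while snd_cwnd < 2 * 9 = 18;
cwnd may therefore keep growing until it reaches 18, as the comment says.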
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3d61c52bdf79..d463c35db33d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1402,11 +1402,19 @@ static void tcp_cwnd_application_limited(struct sock *sk)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs)
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	tp->lsnd_pending = tp->packets_out + unsent_segs;
+	/* Track the maximum number of outstanding packets in each
+	 * window, and remember whether we were cwnd-limited then.
+	 */
+	if (!before(tp->snd_una, tp->max_packets_seq) ||
+	    tp->packets_out > tp->max_packets_out) {
+		tp->max_packets_out = tp->packets_out;
+		tp->max_packets_seq = tp->snd_nxt;
+		tp->is_cwnd_limited = is_cwnd_limited;
+	}
 
 	if (tcp_is_cwnd_limited(sk)) {
 		/* Network is feed fully. */
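For illustration only (not part of the patch): a standalone sketch of
the tracking rule above. seq_before() is a hypothetical stand-in for
the kernel's wrap-safe before(); a fresh measurement window starts once
the recorded flight is fully ACKed, and within a window the record is
only ever raised, never lowered.

	#include <stdbool.h>
	#include <stdint.h>

	static bool seq_before(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b) < 0; /* wrap-safe, like the kernel's before() */
	}

	struct flight_track {
		uint32_t max_packets_out; /* peak in-flight count this window */
		uint32_t max_packets_seq; /* right edge of that flight */
		bool is_cwnd_limited;     /* was that flight cwnd-limited? */
	};

	static void track_flight(struct flight_track *t, uint32_t snd_una,
				 uint32_t snd_nxt, uint32_t packets_out,
				 bool is_cwnd_limited)
	{
		/* Start a fresh window once the recorded flight is fully ACKed,
		 * or update in place if the current flight sets a new peak.
		 */
		if (!seq_before(snd_una, t->max_packets_seq) ||
		    packets_out > t->max_packets_out) {
			t->max_packets_out = packets_out;
			t->max_packets_seq = snd_nxt;
			t->is_cwnd_limited = is_cwnd_limited;
		}
	}

	int main(void)
	{
		struct flight_track t = { 0 };

		/* A flight of 10 packets ending at seq 1000, cwnd-limited. */
		track_flight(&t, 0, 1000, 10, true);
		/* Before seq 1000 is ACKed, a smaller flight must not lower the peak. */
		track_flight(&t, 500, 1200, 4, false);
		return (t.max_packets_out == 10 && t.is_cwnd_limited) ? 0 : 1;
	}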
@@ -1660,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  *
  * This algorithm is from John Heffner.
  */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+				 bool *is_cwnd_limited)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1724,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	if (!tp->tso_deferred)
 		tp->tso_deferred = 1 | (jiffies << 1);
 
+	if (cong_win < send_win && cong_win < skb->len)
+		*is_cwnd_limited = true;
+
 	return true;
 
 send_now:
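Reading the new test above: a deferred flight is marked cwnd-limited
only when the congestion window is the binding constraint, i.e. when
cong_win is both smaller than the receiver-imposed send_win and too
small to carry the deferred skb in one burst.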
@@ -1881,9 +1893,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	unsigned int tso_segs, sent_pkts, unsent_segs = 0;
+	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
 	int result;
+	bool is_cwnd_limited = false;
 
 	sent_pkts = 0;
 
@@ -1908,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		cwnd_quota = tcp_cwnd_test(tp, skb);
 		if (!cwnd_quota) {
+			is_cwnd_limited = true;
 			if (push_one == 2)
 				/* Force out a loss probe pkt. */
 				cwnd_quota = 1;
@@ -1924,8 +1938,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						      nonagle : TCP_NAGLE_PUSH))))
 				break;
 		} else {
-			if (!push_one && tcp_tso_should_defer(sk, skb))
-				goto compute_unsent_segs;
+			if (!push_one &&
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+				break;
 		}
 
 		/* TCP Small Queues :
@@ -1950,14 +1965,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		 * there is no smp_mb__after_set_bit() yet
 		 */
 		smp_mb__after_clear_bit();
-		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
-			u32 unsent_bytes;
-
-compute_unsent_segs:
-			unsent_bytes = tp->write_seq - tp->snd_nxt;
-			unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now);
+		if (atomic_read(&sk->sk_wmem_alloc) > limit)
 			break;
-		}
 	}
 
 	limit = mss_now;
@@ -1997,7 +2006,7 @@ repair:
 		/* Send one loss probe per tail loss episode. */
 		if (push_one != 2)
 			tcp_schedule_loss_probe(sk);
-		tcp_cwnd_validate(sk, unsent_segs);
+		tcp_cwnd_validate(sk, is_cwnd_limited);
 		return false;
 	}
 	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));