aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2014-04-30 14:58:13 -0400
committerDavid S. Miller <davem@davemloft.net>2014-05-02 17:54:35 -0400
commite114a710aa5058c0ba4aa1dfb105132aefeb5e04 (patch)
tree3d7c656358bbc5cd37f7c2a973923e6be6ced1d9 /net/ipv4
parent4e8bbb819d1594a01f91b1de83321f68d3e6e245 (diff)
tcp: fix cwnd limited checking to improve congestion control
Yuchung discovered tcp_is_cwnd_limited() was returning false in the slow start phase even if the application filled the socket write queue. All congestion modules take into account tcp_is_cwnd_limited() before increasing cwnd, so this behavior keeps slow start from probing the bandwidth at full speed. The problem is that even if the write queue is full (aka we are _not_ application limited), cwnd can be underutilized if TSO decides to auto defer or TCP Small Queues decides to hold packets. So in_flight can be kept to a smaller value, and we can get to the point where tcp_is_cwnd_limited() returns false. With TCP Small Queues and FQ/pacing, this issue is more visible. We fix this by having tcp_cwnd_validate(), which is supposed to track such things, take into account unsent_segs, the number of segs that we are not sending at the moment due to TSO or TSQ, but intend to send real soon. Then when we are cwnd-limited, remember this fact while we are processing the window of ACKs that comes back. For example, suppose we have a brand new connection with cwnd=10; we are in slow start, and we send a flight of 9 packets. By the time we have received ACKs for all 9 packets we want our cwnd to be 18. We implement this by setting tp->lsnd_pending to 9, and considering ourselves to be cwnd-limited while cwnd is less than twice tp->lsnd_pending (2*9 -> 18). This makes tcp_is_cwnd_limited() more understandable, by removing the GSO/TSO kludge that tried to work around the issue. Note the in_flight parameter can be removed in a follow-up cleanup patch. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/tcp_cong.c20
-rw-r--r--net/ipv4/tcp_output.c21
2 files changed, 14 insertions, 27 deletions
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 2b9464c93b88..a93b41ba05ff 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -276,26 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
276 return err; 276 return err;
277} 277}
278 278
279/* RFC2861 Check whether we are limited by application or congestion window
280 * This is the inverse of cwnd check in tcp_tso_should_defer
281 */
282bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
283{
284 const struct tcp_sock *tp = tcp_sk(sk);
285 u32 left;
286
287 if (in_flight >= tp->snd_cwnd)
288 return true;
289
290 left = tp->snd_cwnd - in_flight;
291 if (sk_can_gso(sk) &&
292 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
293 left < tp->xmit_size_goal_segs)
294 return true;
295 return left <= tcp_max_tso_deferred_mss(tp);
296}
297EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
298
299/* Slow start is used when congestion window is no greater than the slow start 279/* Slow start is used when congestion window is no greater than the slow start
300 * threshold. We base on RFC2581 and also handle stretch ACKs properly. 280 * threshold. We base on RFC2581 and also handle stretch ACKs properly.
301 * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but 281 * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20847de991ea..f9181a133462 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1402,12 +1402,13 @@ static void tcp_cwnd_application_limited(struct sock *sk)
1402 tp->snd_cwnd_stamp = tcp_time_stamp; 1402 tp->snd_cwnd_stamp = tcp_time_stamp;
1403} 1403}
1404 1404
1405/* Congestion window validation. (RFC2861) */ 1405static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs)
1406static void tcp_cwnd_validate(struct sock *sk)
1407{ 1406{
1408 struct tcp_sock *tp = tcp_sk(sk); 1407 struct tcp_sock *tp = tcp_sk(sk);
1409 1408
1410 if (tp->packets_out >= tp->snd_cwnd) { 1409 tp->lsnd_pending = tp->packets_out + unsent_segs;
1410
1411 if (tcp_is_cwnd_limited(sk, 0)) {
1411 /* Network is feed fully. */ 1412 /* Network is feed fully. */
1412 tp->snd_cwnd_used = 0; 1413 tp->snd_cwnd_used = 0;
1413 tp->snd_cwnd_stamp = tcp_time_stamp; 1414 tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1880,7 +1881,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1880{ 1881{
1881 struct tcp_sock *tp = tcp_sk(sk); 1882 struct tcp_sock *tp = tcp_sk(sk);
1882 struct sk_buff *skb; 1883 struct sk_buff *skb;
1883 unsigned int tso_segs, sent_pkts; 1884 unsigned int tso_segs, sent_pkts, unsent_segs = 0;
1884 int cwnd_quota; 1885 int cwnd_quota;
1885 int result; 1886 int result;
1886 1887
@@ -1924,7 +1925,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1924 break; 1925 break;
1925 } else { 1926 } else {
1926 if (!push_one && tcp_tso_should_defer(sk, skb)) 1927 if (!push_one && tcp_tso_should_defer(sk, skb))
1927 break; 1928 goto compute_unsent_segs;
1928 } 1929 }
1929 1930
1930 /* TCP Small Queues : 1931 /* TCP Small Queues :
@@ -1949,8 +1950,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1949 * there is no smp_mb__after_set_bit() yet 1950 * there is no smp_mb__after_set_bit() yet
1950 */ 1951 */
1951 smp_mb__after_clear_bit(); 1952 smp_mb__after_clear_bit();
1952 if (atomic_read(&sk->sk_wmem_alloc) > limit) 1953 if (atomic_read(&sk->sk_wmem_alloc) > limit) {
1954 u32 unsent_bytes;
1955
1956compute_unsent_segs:
1957 unsent_bytes = tp->write_seq - tp->snd_nxt;
1958 unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now);
1953 break; 1959 break;
1960 }
1954 } 1961 }
1955 1962
1956 limit = mss_now; 1963 limit = mss_now;
@@ -1990,7 +1997,7 @@ repair:
1990 /* Send one loss probe per tail loss episode. */ 1997 /* Send one loss probe per tail loss episode. */
1991 if (push_one != 2) 1998 if (push_one != 2)
1992 tcp_schedule_loss_probe(sk); 1999 tcp_schedule_loss_probe(sk);
1993 tcp_cwnd_validate(sk); 2000 tcp_cwnd_validate(sk, unsent_segs);
1994 return false; 2001 return false;
1995 } 2002 }
1996 return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); 2003 return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));