author     Eric Dumazet <edumazet@google.com>     2014-04-30 14:58:13 -0400
committer  David S. Miller <davem@davemloft.net>  2014-05-02 17:54:35 -0400
commit     e114a710aa5058c0ba4aa1dfb105132aefeb5e04
tree       3d7c656358bbc5cd37f7c2a973923e6be6ced1d9 /net/ipv4
parent     4e8bbb819d1594a01f91b1de83321f68d3e6e245
tcp: fix cwnd limited checking to improve congestion control
Yuchung discovered that tcp_is_cwnd_limited() was returning false in the
slow start phase even when the application had filled the socket write queue.
All congestion modules consult tcp_is_cwnd_limited() before increasing cwnd,
so this behavior keeps slow start from probing the bandwidth at full speed,
as the sketch below illustrates.
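(For context, every congestion control module's cong_avoid hook starts with
this check. The following is a simplified sketch modeled on
tcp_reno_cong_avoid() from this kernel generation, not a verbatim quote; the
function name is hypothetical. It shows why a false negative freezes cwnd
growth.)

/* Simplified sketch, modeled on tcp_reno_cong_avoid(); every
 * cong_avoid hook follows this pattern.
 */
static void example_cong_avoid(struct sock *sk, u32 ack, u32 acked,
			       u32 in_flight)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* A false return here freezes cwnd, even in slow start. */
	if (!tcp_is_cwnd_limited(sk, in_flight))
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp, acked);		/* exponential growth */
	else
		tcp_cong_avoid_ai(tp, tp->snd_cwnd);	/* additive increase */
}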
The problem is that even if the write queue is full (i.e. we are _not_
application-limited), cwnd can be under-utilized if TSO auto-defers or
TCP Small Queues decides to hold packets.
So in_flight can be kept at a smaller value, and we can reach the point
where tcp_is_cwnd_limited() returns false.
With TCP Small Queues and FQ/pacing, this issue is more visible.
We fix this by having tcp_cwnd_validate(), which is supposed to track
such things, take into account unsent_segs: the number of segments we are
not sending at the moment due to TSO or TSQ, but intend to send soon.
Then, when we are cwnd-limited, we remember this fact while processing
the window of ACKs that comes back.
For example, suppose we have a brand new connection with cwnd=10; we
are in slow start, and we send a flight of 9 packets. By the time we
have received ACKs for all 9 packets we want our cwnd to be 18.
We implement this by setting tp->lsnd_pending to 9, and
considering ourselves to be cwnd-limited while cwnd is less than
twice tp->lsnd_pending (2*9 -> 18).
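(The helper implementing this check lives in include/net/tcp.h and is
therefore outside the net/ipv4 diffstat below; the commit also adds the
u32 lsnd_pending field to struct tcp_sock. As best I can reconstruct from
the commit, the new version reads:)

static inline bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
	if (tp->snd_cwnd <= tp->snd_ssthresh)
		return tp->snd_cwnd < 2 * tp->lsnd_pending;

	return tp->packets_out >= tp->snd_cwnd;
}

Plugging in the example above: lsnd_pending = 9 keeps the flow flagged
cwnd-limited until snd_cwnd reaches 2 * 9 = 18.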
This makes tcp_is_cwnd_limited() more understandable by removing the
GSO/TSO kludge that tried to work around the issue.
Note that the in_flight parameter can be removed in a follow-up cleanup
patch.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
 net/ipv4/tcp_cong.c   | 20 --------------------
 net/ipv4/tcp_output.c | 21 ++++++++++++++-------
 2 files changed, 14 insertions(+), 27 deletions(-)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 2b9464c93b88..a93b41ba05ff 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -276,26 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
 	return err;
 }
 
-/* RFC2861 Check whether we are limited by application or congestion window
- * This is the inverse of cwnd check in tcp_tso_should_defer
- */
-bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 left;
-
-	if (in_flight >= tp->snd_cwnd)
-		return true;
-
-	left = tp->snd_cwnd - in_flight;
-	if (sk_can_gso(sk) &&
-	    left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
-	    left < tp->xmit_size_goal_segs)
-		return true;
-	return left <= tcp_max_tso_deferred_mss(tp);
-}
-EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
-
 /* Slow start is used when congestion window is no greater than the slow start
  * threshold. We base on RFC2581 and also handle stretch ACKs properly.
  * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20847de991ea..f9181a133462 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1402,12 +1402,13 @@ static void tcp_cwnd_application_limited(struct sock *sk)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tp->packets_out >= tp->snd_cwnd) {
+	tp->lsnd_pending = tp->packets_out + unsent_segs;
+
+	if (tcp_is_cwnd_limited(sk, 0)) {
 		/* Network is feed fully. */
 		tp->snd_cwnd_used = 0;
 		tp->snd_cwnd_stamp = tcp_time_stamp;
@@ -1880,7 +1881,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	unsigned int tso_segs, sent_pkts;
+	unsigned int tso_segs, sent_pkts, unsent_segs = 0;
 	int cwnd_quota;
 	int result;
 
@@ -1924,7 +1925,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		} else {
 			if (!push_one && tcp_tso_should_defer(sk, skb))
-				break;
+				goto compute_unsent_segs;
 		}
 
 		/* TCP Small Queues :
@@ -1949,8 +1950,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			 * there is no smp_mb__after_set_bit() yet
 			 */
 			smp_mb__after_clear_bit();
-			if (atomic_read(&sk->sk_wmem_alloc) > limit)
+			if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+				u32 unsent_bytes;
+
+compute_unsent_segs:
+				unsent_bytes = tp->write_seq - tp->snd_nxt;
+				unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now);
 				break;
+			}
 		}
 
 		limit = mss_now;
@@ -1990,7 +1997,7 @@ repair:
 		/* Send one loss probe per tail loss episode. */
 		if (push_one != 2)
 			tcp_schedule_loss_probe(sk);
-		tcp_cwnd_validate(sk);
+		tcp_cwnd_validate(sk, unsent_segs);
 		return false;
 	}
 	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
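(As a standalone illustration of the compute_unsent_segs hunk above, using
hypothetical sequence numbers and the DIV_ROUND_UP macro copied from the
kernel's definition; the point is that a partial trailing segment is
rounded up to a full segment:)

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* Hypothetical values: roughly 9 MSS of unsent data queued. */
	unsigned int write_seq = 13000;	/* stands in for tp->write_seq */
	unsigned int snd_nxt   = 340;	/* stands in for tp->snd_nxt   */
	unsigned int mss_now   = 1460;

	unsigned int unsent_bytes = write_seq - snd_nxt;
	unsigned int unsent_segs  = DIV_ROUND_UP(unsent_bytes, mss_now);

	/* 12660 bytes / 1460 = 8.67..., rounded up to 9 segments */
	printf("%u unsent bytes -> %u segs\n", unsent_bytes, unsent_segs);
	return 0;
}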