diff options
| author | Marcelo Leitner <mleitner@redhat.com> | 2014-11-04 14:15:08 -0500 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2014-11-05 16:59:49 -0500 |
| commit | 1f37bf87aa7523d28e7e4c4f7bb5dba98faa3e00 (patch) | |
| tree | 587ea2b8eaff79f04b9f6cadce4089c1dd765b32 | |
| parent | 46d3802627f60be1c17659a44c8d2d7a5e247023 (diff) | |
tcp: zero retrans_stamp if all retrans were acked
Ueki Kohei reported that when we are using NewReno with connections that
have a very low traffic, we may timeout the connection too early if a
second loss occurs after the first one was successfully acked but no
data was transfered later. Below is his description of it:
When SACK is disabled, and a socket suffers multiple separate TCP
retransmissions, that socket's ETIMEDOUT value is calculated from the
time of the *first* retransmission instead of the *latest*
retransmission.
This happens because the tcp_sock's retrans_stamp is set once then never
cleared.
Take the following connection:
Linux remote-machine
| |
send#1---->(*1)|--------> data#1 --------->|
| | |
RTO : :
| | |
---(*2)|----> data#1(retrans) ---->|
| (*3)|<---------- ACK <----------|
| | |
| : :
| : :
| : :
16 minutes (or more) :
| : :
| : :
| : :
| | |
send#2---->(*4)|--------> data#2 --------->|
| | |
RTO : :
| | |
---(*5)|----> data#2(retrans) ---->|
| | |
| | |
RTO*2 : :
| | |
| | |
ETIMEDOUT<----(*6)| |
(*1) One data packet sent.
(*2) Because no ACK packet is received, the packet is retransmitted.
(*3) The ACK packet is received. The transmitted packet is acknowledged.
At this point the first "retransmission event" has passed and been
recovered from. Any future retransmission is a completely new "event".
(*4) After 16 minutes (to correspond with retries2=15), a new data
packet is sent. Note: No data is transmitted between (*3) and (*4).
The socket's timeout SHOULD be calculated from this point in time, but
instead it's calculated from the prior "event" 16 minutes ago.
(*5) Because no ACK packet is received, the packet is retransmitted.
(*6) At the time of the 2nd retransmission, the socket returns
ETIMEDOUT.
Therefore, now we clear retrans_stamp as soon as all data during the
loss window is fully acked.
Reported-by: Ueki Kohei
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
| -rw-r--r-- | net/ipv4/tcp_input.c | 60 |
1 files changed, 31 insertions, 29 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a12b455928e5..88fa2d160685 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
| @@ -2315,6 +2315,35 @@ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) | |||
| 2315 | 2315 | ||
| 2316 | /* Undo procedures. */ | 2316 | /* Undo procedures. */ |
| 2317 | 2317 | ||
| 2318 | /* We can clear retrans_stamp when there are no retransmissions in the | ||
| 2319 | * window. It would seem that it is trivially available for us in | ||
| 2320 | * tp->retrans_out, however, that kind of assumptions doesn't consider | ||
| 2321 | * what will happen if errors occur when sending retransmission for the | ||
| 2322 | * second time. ...It could the that such segment has only | ||
| 2323 | * TCPCB_EVER_RETRANS set at the present time. It seems that checking | ||
| 2324 | * the head skb is enough except for some reneging corner cases that | ||
| 2325 | * are not worth the effort. | ||
| 2326 | * | ||
| 2327 | * Main reason for all this complexity is the fact that connection dying | ||
| 2328 | * time now depends on the validity of the retrans_stamp, in particular, | ||
| 2329 | * that successive retransmissions of a segment must not advance | ||
| 2330 | * retrans_stamp under any conditions. | ||
| 2331 | */ | ||
| 2332 | static bool tcp_any_retrans_done(const struct sock *sk) | ||
| 2333 | { | ||
| 2334 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2335 | struct sk_buff *skb; | ||
| 2336 | |||
| 2337 | if (tp->retrans_out) | ||
| 2338 | return true; | ||
| 2339 | |||
| 2340 | skb = tcp_write_queue_head(sk); | ||
| 2341 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) | ||
| 2342 | return true; | ||
| 2343 | |||
| 2344 | return false; | ||
| 2345 | } | ||
| 2346 | |||
| 2318 | #if FASTRETRANS_DEBUG > 1 | 2347 | #if FASTRETRANS_DEBUG > 1 |
| 2319 | static void DBGUNDO(struct sock *sk, const char *msg) | 2348 | static void DBGUNDO(struct sock *sk, const char *msg) |
| 2320 | { | 2349 | { |
| @@ -2410,6 +2439,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) | |||
| 2410 | * is ACKed. For Reno it is MUST to prevent false | 2439 | * is ACKed. For Reno it is MUST to prevent false |
| 2411 | * fast retransmits (RFC2582). SACK TCP is safe. */ | 2440 | * fast retransmits (RFC2582). SACK TCP is safe. */ |
| 2412 | tcp_moderate_cwnd(tp); | 2441 | tcp_moderate_cwnd(tp); |
| 2442 | if (!tcp_any_retrans_done(sk)) | ||
| 2443 | tp->retrans_stamp = 0; | ||
| 2413 | return true; | 2444 | return true; |
| 2414 | } | 2445 | } |
| 2415 | tcp_set_ca_state(sk, TCP_CA_Open); | 2446 | tcp_set_ca_state(sk, TCP_CA_Open); |
| @@ -2430,35 +2461,6 @@ static bool tcp_try_undo_dsack(struct sock *sk) | |||
| 2430 | return false; | 2461 | return false; |
| 2431 | } | 2462 | } |
| 2432 | 2463 | ||
| 2433 | /* We can clear retrans_stamp when there are no retransmissions in the | ||
| 2434 | * window. It would seem that it is trivially available for us in | ||
| 2435 | * tp->retrans_out, however, that kind of assumptions doesn't consider | ||
| 2436 | * what will happen if errors occur when sending retransmission for the | ||
| 2437 | * second time. ...It could the that such segment has only | ||
| 2438 | * TCPCB_EVER_RETRANS set at the present time. It seems that checking | ||
| 2439 | * the head skb is enough except for some reneging corner cases that | ||
| 2440 | * are not worth the effort. | ||
| 2441 | * | ||
| 2442 | * Main reason for all this complexity is the fact that connection dying | ||
| 2443 | * time now depends on the validity of the retrans_stamp, in particular, | ||
| 2444 | * that successive retransmissions of a segment must not advance | ||
| 2445 | * retrans_stamp under any conditions. | ||
| 2446 | */ | ||
| 2447 | static bool tcp_any_retrans_done(const struct sock *sk) | ||
| 2448 | { | ||
| 2449 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2450 | struct sk_buff *skb; | ||
| 2451 | |||
| 2452 | if (tp->retrans_out) | ||
| 2453 | return true; | ||
| 2454 | |||
| 2455 | skb = tcp_write_queue_head(sk); | ||
| 2456 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) | ||
| 2457 | return true; | ||
| 2458 | |||
| 2459 | return false; | ||
| 2460 | } | ||
| 2461 | |||
| 2462 | /* Undo during loss recovery after partial ACK or using F-RTO. */ | 2464 | /* Undo during loss recovery after partial ACK or using F-RTO. */ |
| 2463 | static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) | 2465 | static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) |
| 2464 | { | 2466 | { |
