author	Neal Cardwell <ncardwell@google.com>	2017-08-03 09:19:54 -0400
committer	David S. Miller <davem@davemloft.net>	2017-08-03 18:38:31 -0400
commit	df92c8394e6ea0469e8056946ef8add740ab8046 (patch)
tree	423c7984d203560f17751f0ba8ba9f0a8ff2ae4e /net/ipv4/tcp_input.c
parent	a2815817ffa68c7933a43eb55836d6e789bd4389 (diff)
tcp: fix xmit timer to only be reset if data ACKed/SACKed
Fix a TCP loss recovery performance bug raised recently on the netdev
list, in two threads:

(i)  July 26, 2017: netdev thread "TCP fast retransmit issues"
(ii) July 26, 2017: netdev thread "[PATCH V2 net-next] TLP: Don't
     reschedule PTO when there's one outstanding TLP retransmission"

The basic problem is that incoming TCP packets that did not indicate
forward progress could cause the xmit timer (TLP or RTO) to be rearmed
and pushed back in time. In certain corner cases this could result in
the following problems noted in these threads:

 - Repeated ACKs coming in with bogus SACKs corrupted by middleboxes
   could cause TCP to repeatedly schedule TLPs forever. We kept
   sending TLPs after every ~200ms, which elicited bogus SACKs, which
   caused more TLPs, ad infinitum; we never fired an RTO to fill in
   the holes.

 - Incoming data segments could, in some cases, cause us to reschedule
   our RTO or TLP timer further out in time, for no good reason. This
   could cause repeated inbound data to result in stalls in outbound
   data, in the presence of packet loss.

This commit fixes these bugs by changing the TLP and RTO ACK
processing to:

 (a) Only reschedule the xmit timer once per ACK.

 (b) Only reschedule the xmit timer if tcp_clean_rtx_queue() deems the
     ACK indicates sufficient forward progress (a packet was
     cumulatively ACKed, or we got a SACK for a packet that was sent
     before the most recent retransmit of the write queue head).

This brings us back into closer compliance with the RFCs, since, as
the comment for tcp_rearm_rto() notes, we should only restart the RTO
timer after forward progress on the connection. Previously we were
restarting the xmit timer even in these cases where there was no
forward progress.

As a side benefit, this commit simplifies and speeds up the TCP timer
arming logic. We had been calling inet_csk_reset_xmit_timer() three
times on normal ACKs that cumulatively acknowledged some data:

1) Once near the top of tcp_ack() to switch from TLP timer to RTO:
        if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
               tcp_rearm_rto(sk);

2) Once in tcp_clean_rtx_queue(), to update the RTO:
        if (flag & FLAG_ACKED) {
               tcp_rearm_rto(sk);

3) Once in tcp_ack() after tcp_fastretrans_alert() to switch from RTO
   to TLP:
        if (icsk->icsk_pending == ICSK_TIME_RETRANS)
               tcp_schedule_loss_probe(sk);

This commit, by only rescheduling the xmit timer once per ACK,
simplifies the code and reduces CPU overhead.

This commit was tested in an A/B test with Google web server traffic.
SNMP stats and request latency metrics were within noise levels,
substantiating that for normal web traffic patterns this is a rare
issue. This commit was also tested with packetdrill tests to verify
that it fixes the timer behavior in the corner cases discussed in the
netdev threads mentioned above.

This patch is a bug fix intended to be queued for -stable releases.

Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)")
Reported-by: Klavs Klavsen <kl@vsen.dk>
Reported-by: Mao Wenan <maowenan@huawei.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
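To make the shape of the fix easy to see outside the kernel, here is a
minimal, self-contained C sketch of the new arming discipline. The
names clean_rtx_queue(), set_xmit_timer(), and ack_received() are
illustrative stand-ins rather than the kernel functions; only the
FLAG_SET_XMIT_TIMER bit value and the TLP-then-RTO fallback are taken
from the patch below.

#include <stdbool.h>
#include <stdio.h>

#define FLAG_SET_XMIT_TIMER 0x1000  /* "this ACK made forward progress" */

/* Stand-in for tcp_clean_rtx_queue(): report progress via a flag bit
 * instead of arming the timer directly.
 */
static int clean_rtx_queue(bool cum_acked, bool sacked_old_data)
{
        int flag = 0;

        /* Forward progress: data was cumulatively ACKed, or a SACK
         * covered a packet sent before the last (re)transmit of the
         * write queue head.
         */
        if (cum_acked || sacked_old_data)
                flag |= FLAG_SET_XMIT_TIMER;
        return flag;
}

static bool schedule_loss_probe(void)
{
        puts("armed TLP");  /* pretend TLP is always schedulable here */
        return true;
}

static void rearm_rto(void)
{
        puts("armed RTO");
}

/* Mirrors the new tcp_set_xmit_timer(): try TLP first, else arm RTO. */
static void set_xmit_timer(void)
{
        if (!schedule_loss_probe())
                rearm_rto();
}

/* Stand-in for the ACK path in tcp_ack(): arm at most once per ACK. */
static void ack_received(bool cum_acked, bool sacked_old_data)
{
        int flag = clean_rtx_queue(cum_acked, sacked_old_data);

        if (flag & FLAG_SET_XMIT_TIMER)
                set_xmit_timer();
        else
                puts("no forward progress: timer left alone");
}

int main(void)
{
        ack_received(true, false);   /* cumulative ACK: rearm       */
        ack_received(false, true);   /* useful SACK: rearm          */
        ack_received(false, false);  /* bogus/dup SACK: leave timer */
        return 0;
}

Folding the decision into a flag bit lets the rtx-queue cleanup report
progress without touching timers itself, so the ACK path can arm the
timer exactly once, and only when the ACK actually moved the
connection forward.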
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	25
1 file changed, 16 insertions, 9 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b9ba8d55d8b8..53de1424c13c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -107,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked */
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
 #define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
 #define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
 #define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack() */
@@ -3016,6 +3017,13 @@ void tcp_rearm_rto(struct sock *sk)
 	}
 }
 
+/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
+static void tcp_set_xmit_timer(struct sock *sk)
+{
+	if (!tcp_schedule_loss_probe(sk))
+		tcp_rearm_rto(sk);
+}
+
 /* If we get here, the whole TSO packet has not been acked. */
 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
@@ -3177,7 +3185,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 					ca_rtt_us, sack->rate);
 
 	if (flag & FLAG_ACKED) {
-		tcp_rearm_rto(sk);
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 		if (unlikely(icsk->icsk_mtup.probe_size &&
 			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
 			tcp_mtup_probe_success(sk);
@@ -3205,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		 * after when the head was last (re)transmitted. Otherwise the
 		 * timeout may continue to extend in loss recovery.
 		 */
-		tcp_rearm_rto(sk);
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 	}
 
 	if (icsk->icsk_ca_ops->pkts_acked) {
@@ -3577,9 +3585,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
-	if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
-		tcp_rearm_rto(sk);
-
 	if (after(ack, prior_snd_una)) {
 		flag |= FLAG_SND_UNA_ADVANCED;
 		icsk->icsk_retransmits = 0;
@@ -3644,18 +3649,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
 				    &sack_state);
 
+	if (tp->tlp_high_seq)
+		tcp_process_tlp_ack(sk, ack, flag);
+	/* If needed, reset TLP/RTO timer; RACK may later override this. */
+	if (flag & FLAG_SET_XMIT_TIMER)
+		tcp_set_xmit_timer(sk);
+
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
 	}
-	if (tp->tlp_high_seq)
-		tcp_process_tlp_ack(sk, ack, flag);
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
 		sk_dst_confirm(sk);
 
-	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
-		tcp_schedule_loss_probe(sk);
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
 	tcp_rate_gen(sk, delivered, lost, sack_state.rate);
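To see why the pre-fix rearming was harmful, the following stand-alone
C simulation (hypothetical, with nominal timings: a no-progress ACK
every 200 ms against a 300 ms timer, loosely modeling the bogus-SACK
scenario from the netdev threads) shows the old behavior starving the
xmit timer while the fixed behavior lets it fire on schedule.

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        const int rto_ms = 300, ack_interval_ms = 200, horizon_ms = 1200;

        for (int fixed = 0; fixed <= 1; fixed++) {
                const char *label = fixed ? "fixed" : "buggy";
                int timer_expiry = rto_ms;
                bool fired = false;

                for (int now = 1; now <= horizon_ms; now++) {
                        if (now == timer_expiry) {
                                printf("%s: xmit timer fired at %d ms\n",
                                       label, now);
                                fired = true;
                                break;
                        }
                        /* Pre-fix behavior: any ACK, even one showing
                         * no forward progress, pushed the timer out.
                         */
                        if (!fixed && now % ack_interval_ms == 0)
                                timer_expiry = now + rto_ms;
                }
                if (!fired)
                        printf("%s: timer never fired within %d ms\n",
                               label, horizon_ms);
        }
        return 0;
}

Compiled and run, the buggy branch reports that the timer never fires
within the horizon, while the fixed branch fires at 300 ms.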