aboutsummaryrefslogtreecommitdiffstats
path: root/include/net/tcp.h
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2015-05-06 17:26:24 -0400
committerDavid S. Miller <davem@davemloft.net>2015-05-09 16:42:32 -0400
commit21c8fe9915276d923f8c1e43434fd6d37a3b9aef (patch)
tree3bf845d67545bbb3b411b3221cffb37e3ed9ce57 /include/net/tcp.h
parentb063bc5ea77b1c1c0e7798f641f53504d0f64bf8 (diff)
tcp: adjust window probe timers to safer values
With the advent of small rto timers in datacenter TCP, (ip route ... rto_min x), the following can happen : 1) Qdisc is full, transmit fails. TCP sets a timer based on icsk_rto to retry the transmit, without exponential backoff. With low icsk_rto, and a lot of sockets, all cpus are servicing timer interrupts like crazy. The intent of the code was to retry with a timer between 200 (TCP_RTO_MIN) and 500ms (TCP_RESOURCE_PROBE_INTERVAL) 2) Receivers can send zero windows if they don't drain their receive queue. TCP sends zero window probes, based on icsk_rto current value, with exponential backoff. With /proc/sys/net/ipv4/tcp_retries2 being 15 (or even smaller in some cases), sender can abort in less than one or two minutes ! If receiver stops the sender, it obviously doesn't care about a very tight rto. Probability of dropping the ACK reopening the window is not worth the risk. Let's change the base timer to be at least 200ms (TCP_RTO_MIN) for these events (but not normal RTO based retransmits) A followup patch adds a new SNMP counter, as it would have helped a lot in diagnosing this issue. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Yuchung Cheng <ycheng@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/net/tcp.h')
-rw-r--r--include/net/tcp.h27
1 file changed, 22 insertions, 5 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d204f3f9df8..7a2248a35b13 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1043,14 +1043,31 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk)
1043 return tp->is_cwnd_limited; 1043 return tp->is_cwnd_limited;
1044} 1044}
1045 1045
1046static inline void tcp_check_probe_timer(struct sock *sk) 1046/* Something is really bad, we could not queue an additional packet,
1047 * because qdisc is full or receiver sent a 0 window.
1048 * We do not want to add fuel to the fire, or abort too early,
1049 * so make sure the timer we arm now is at least 200ms in the future,
1050 * regardless of current icsk_rto value (as it could be ~2ms)
1051 */
1052static inline unsigned long tcp_probe0_base(const struct sock *sk)
1047{ 1053{
1048 const struct tcp_sock *tp = tcp_sk(sk); 1054 return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
1049 const struct inet_connection_sock *icsk = inet_csk(sk); 1055}
1050 1056
1051 if (!tp->packets_out && !icsk->icsk_pending) 1057/* Variant of inet_csk_rto_backoff() used for zero window probes */
1058static inline unsigned long tcp_probe0_when(const struct sock *sk,
1059 unsigned long max_when)
1060{
1061 u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff;
1062
1063 return (unsigned long)min_t(u64, when, max_when);
1064}
1065
1066static inline void tcp_check_probe_timer(struct sock *sk)
1067{
1068 if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
1052 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 1069 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
1053 icsk->icsk_rto, TCP_RTO_MAX); 1070 tcp_probe0_base(sk), TCP_RTO_MAX);
1054} 1071}
1055 1072
1056static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) 1073static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)