tcp: abort orphan sockets stalling on zero window probes

Currently we have two different policies for orphan sockets that repeatedly stall on zero window ACKs. If a socket gets a zero window ACK when it is transmitting data, the RTO is used to probe the window. The socket is aborted after roughly tcp_orphan_retries() retries (as in tcp_write_timeout()).

But if the socket was idle when it received the zero window ACK, and later wants to send more data, we use the probe timer to probe the window. If the receiver always returns zero window ACKs, icsk_probes_out keeps getting reset in tcp_ack() and the orphan socket can stall forever until the system reaches the orphan limit (as commented in tcp_probe_timer()). This opens up a simple attack to create lots of hanging orphan sockets to burn the memory and the CPU, as demonstrated in the recent netdev post "TCP connection will hang in FIN_WAIT1 after closing if zero window is advertised." http://www.spinics.net/lists/netdev/msg296539.html

This patch follows the design in RTO-based probe: we abort an orphan socket stalling on zero window when the probe timer reaches both the maximum backoff and the maximum RTO. For example, a 100ms RTT connection will time out after roughly 153 seconds (0.3 + 0.6 + ... + 76.8) if the receiver keeps the window shut.

If the orphan socket passes this check, but the system already has too many orphans (as in tcp_out_of_resources()), we still abort it but we'll also send an RST packet as the connection may still be active.

In addition, we change TCP_USER_TIMEOUT to cover (live or dead) sockets stalled on zero-window probes. This changes the semantics of TCP_USER_TIMEOUT slightly because it previously applied only when the socket had a pending transmission.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reported-by: Andrey Dmitrov <andrey.dmitrov@oktetlabs.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
author: Yuchung Cheng <ycheng@google.com> 2014-09-29 16:20:38 -0400
committer: David S. Miller <davem@davemloft.net> 2014-10-01 16:27:52 -0400
commit: b248230c34970a6c1c17c591d63b464e8d2cfc33 (patch)
tree: 1b87913e6b3dc3574cbe78f7d1736ae4074ebf93 /net
parent: cb57659a15c6c0576493cc8a10474ce7ffd44eb3 (diff)
2 files changed, 22 insertions, 21 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5c170340f684..26a6f113f00c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2693,7 +2693,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                break;
 #endif
        case TCP_USER_TIMEOUT:
-                /* Cap the max timeout in ms TCP will retry/retrans
+                /* Cap the max time in ms TCP will retry or probe the window
                 * before giving up and aborting (ETIMEDOUT) a connection.
                 */
                if (val < 0)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b24360f6e293..9b21ae8b2e31 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -52,7 +52,7 @@ static void tcp_write_err(struct sock *sk)
 *    limit.
 * 2. If we have strong memory pressure.
 */
-static int tcp_out_of_resources(struct sock *sk, int do_reset)
+static int tcp_out_of_resources(struct sock *sk, bool do_reset)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        int shift = 0;
@@ -72,7 +72,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
                if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
                    /*  2. Window is closed. */
                    (!tp->snd_wnd && !tp->packets_out))
-                        do_reset = 1;
+                        do_reset = true;
                if (do_reset)
                        tcp_send_active_reset(sk, GFP_ATOMIC);
                tcp_done(sk);
@@ -270,40 +270,41 @@ static void tcp_probe_timer(struct sock *sk)
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int max_probes;
+        u32 start_ts;
        if (tp->packets_out || !tcp_send_head(sk)) {
                icsk->icsk_probes_out = 0;
                return;
        }
-        /* *WARNING* RFC 1122 forbids this
+        /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
-         *
+         * long as the receiver continues to respond probes. We support this by
-         * It doesn't AFAIK, because we kill the retransmit timer -AK
+         * default and reset icsk_probes_out with incoming ACKs. But if the
-         *
+         * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
-         * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
+         * kill the socket when the retry count and the time exceeds the
-         * this behaviour in Solaris down as a bug fix. [AC]
+         * corresponding system limit. We also implement similar policy when
-         *
+         * we use RTO to probe window in tcp_retransmit_timer().
-         * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
-         * even if they advertise zero window. Hence, connection is killed only
-         * if we received no ACKs for normal connection timeout. It is not killed
-         * only because window stays zero for some time, window may be zero
-         * until armageddon and even later. We are in full accordance
-         * with RFCs, only probe timer combines both retransmission timeout
-         * and probe timeout in one bottle.                             --ANK
         */
-        max_probes = sysctl_tcp_retries2;
+        start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+        if (!start_ts)
+                skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp);
+        else if (icsk->icsk_user_timeout &&
+                 (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
+                goto abort;
+        max_probes = sysctl_tcp_retries2;
        if (sock_flag(sk, SOCK_DEAD)) {
                const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
                max_probes = tcp_orphan_retries(sk, alive);
+                if (!alive && icsk->icsk_backoff >= max_probes)
-                if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
+                        goto abort;
+                if (tcp_out_of_resources(sk, true))
                        return;
        }
        if (icsk->icsk_probes_out > max_probes) {
-                tcp_write_err(sk);
+abort:          tcp_write_err(sk);
        } else {
                /* Only send another probe if we didn't close things up. */
                tcp_send_probe0(sk);
author	Yuchung Cheng <ycheng@google.com>	2014-09-29 16:20:38 -0400
committer	David S. Miller <davem@davemloft.net>	2014-10-01 16:27:52 -0400
commit	b248230c34970a6c1c17c591d63b464e8d2cfc33 (patch)
tree	1b87913e6b3dc3574cbe78f7d1736ae4074ebf93 /net
parent	cb57659a15c6c0576493cc8a10474ce7ffd44eb3 (diff)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5c170340f684..26a6f113f00c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c
@@ -2693,7 +2693,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2693	break;	2693	break;
2694	#endif	2694	#endif
2695	case TCP_USER_TIMEOUT:	2695	case TCP_USER_TIMEOUT:
2696	/* Cap the max timeout in ms TCP will retry/retrans	2696	/* Cap the max time in ms TCP will retry or probe the window
2697	* before giving up and aborting (ETIMEDOUT) a connection.	2697	* before giving up and aborting (ETIMEDOUT) a connection.
2698	*/	2698	*/
2699	if (val < 0)	2699	if (val < 0)


diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b24360f6e293..9b21ae8b2e31 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c
@@ -52,7 +52,7 @@ static void tcp_write_err(struct sock *sk)
52	* limit.	52	* limit.
53	* 2. If we have strong memory pressure.	53	* 2. If we have strong memory pressure.
54	*/	54	*/
55	static int tcp_out_of_resources(struct sock *sk, int do_reset)	55	static int tcp_out_of_resources(struct sock *sk, bool do_reset)
56	{	56	{
57	struct tcp_sock *tp = tcp_sk(sk);	57	struct tcp_sock *tp = tcp_sk(sk);
58	int shift = 0;	58	int shift = 0;
@@ -72,7 +72,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
72	if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN \|\|	72	if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN \|\|
73	/* 2. Window is closed. */	73	/* 2. Window is closed. */
74	(!tp->snd_wnd && !tp->packets_out))	74	(!tp->snd_wnd && !tp->packets_out))
75	do_reset = 1;	75	do_reset = true;
76	if (do_reset)	76	if (do_reset)
77	tcp_send_active_reset(sk, GFP_ATOMIC);	77	tcp_send_active_reset(sk, GFP_ATOMIC);
78	tcp_done(sk);	78	tcp_done(sk);
@@ -270,40 +270,41 @@ static void tcp_probe_timer(struct sock *sk)
270	struct inet_connection_sock *icsk = inet_csk(sk);	270	struct inet_connection_sock *icsk = inet_csk(sk);
271	struct tcp_sock *tp = tcp_sk(sk);	271	struct tcp_sock *tp = tcp_sk(sk);
272	int max_probes;	272	int max_probes;
		273	u32 start_ts;
273		274
274	if (tp->packets_out \|\| !tcp_send_head(sk)) {	275	if (tp->packets_out \|\| !tcp_send_head(sk)) {
275	icsk->icsk_probes_out = 0;	276	icsk->icsk_probes_out = 0;
276	return;	277	return;
277	}	278	}
278		279
279	/* WARNING RFC 1122 forbids this	280	/* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
280	*	281	* long as the receiver continues to respond probes. We support this by
281	* It doesn't AFAIK, because we kill the retransmit timer -AK	282	* default and reset icsk_probes_out with incoming ACKs. But if the
282	*	283	* socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
283	* FIXME: We ought not to do it, Solaris 2.5 actually has fixing	284	* kill the socket when the retry count and the time exceeds the
284	* this behaviour in Solaris down as a bug fix. [AC]	285	* corresponding system limit. We also implement similar policy when
285	*	286	* we use RTO to probe window in tcp_retransmit_timer().
286	* Let me to explain. icsk_probes_out is zeroed by incoming ACKs
287	* even if they advertise zero window. Hence, connection is killed only
288	* if we received no ACKs for normal connection timeout. It is not killed
289	* only because window stays zero for some time, window may be zero
290	* until armageddon and even later. We are in full accordance
291	* with RFCs, only probe timer combines both retransmission timeout
292	* and probe timeout in one bottle. --ANK
293	*/	287	*/
294	max_probes = sysctl_tcp_retries2;	288	start_ts = tcp_skb_timestamp(tcp_send_head(sk));
		289	if (!start_ts)
		290	skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp);
		291	else if (icsk->icsk_user_timeout &&
		292	(s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
		293	goto abort;
295		294
		295	max_probes = sysctl_tcp_retries2;
296	if (sock_flag(sk, SOCK_DEAD)) {	296	if (sock_flag(sk, SOCK_DEAD)) {
297	const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;	297	const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
298		298
299	max_probes = tcp_orphan_retries(sk, alive);	299	max_probes = tcp_orphan_retries(sk, alive);
300		300	if (!alive && icsk->icsk_backoff >= max_probes)
301	if (tcp_out_of_resources(sk, alive \|\| icsk->icsk_probes_out <= max_probes))	301	goto abort;
		302	if (tcp_out_of_resources(sk, true))
302	return;	303	return;
303	}	304	}
304		305
305	if (icsk->icsk_probes_out > max_probes) {	306	if (icsk->icsk_probes_out > max_probes) {
306	tcp_write_err(sk);	307	abort: tcp_write_err(sk);
307	} else {	308	} else {
308	/* Only send another probe if we didn't close things up. */	309	/* Only send another probe if we didn't close things up. */
309	tcp_send_probe0(sk);	310	tcp_send_probe0(sk);