about | summary | refs | log | tree | commit | diff | stats
path: root/net
diff options
context:
space:
mode:
author Yuchung Cheng <ycheng@google.com> 2014-09-29 16:20:38 -0400
committer David S. Miller <davem@davemloft.net> 2014-10-01 16:27:52 -0400
commit b248230c34970a6c1c17c591d63b464e8d2cfc33 (patch)
tree 1b87913e6b3dc3574cbe78f7d1736ae4074ebf93 /net
parent cb57659a15c6c0576493cc8a10474ce7ffd44eb3 (diff)
tcp: abort orphan sockets stalling on zero window probes
Currently, we have two different policies for orphan sockets that repeatedly stall on zero window ACKs. If a socket gets a zero window ACK when it is transmitting data, the RTO is used to probe the window. The socket is aborted after roughly tcp_orphan_retries() retries (as in tcp_write_timeout()). But if the socket was idle when it received the zero window ACK, and later wants to send more data, we use the probe timer to probe the window. If the receiver always returns zero window ACKs, icsk_probes_out keeps getting reset in tcp_ack() and the orphan socket can stall forever until the system reaches the orphan limit (as commented in tcp_probe_timer()). This opens up a simple attack to create lots of hanging orphan sockets to burn the memory and the CPU, as demonstrated in the recent netdev post "TCP connection will hang in FIN_WAIT1 after closing if zero window is advertised." http://www.spinics.net/lists/netdev/msg296539.html This patch follows the design of the RTO-based probe: we abort an orphan socket stalling on zero window when the probe timer reaches both the maximum backoff and the maximum RTO. For example, a 100 ms RTT connection will time out after roughly 153 seconds (0.3 + 0.6 + ... + 76.8) if the receiver keeps the window shut. If the orphan socket passes this check, but the system already has too many orphans (as in tcp_out_of_resources()), we still abort it but we'll also send an RST packet as the connection may still be active. In addition, we change TCP_USER_TIMEOUT to cover (live or dead) sockets stalled on zero-window probes. This changes the semantics of TCP_USER_TIMEOUT slightly because it previously only applied when the socket had pending transmission. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Reported-by: Andrey Dmitrov <andrey.dmitrov@oktetlabs.ru> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r-- net/ipv4/tcp.c | 2
-rw-r--r-- net/ipv4/tcp_timer.c | 41
2 files changed, 22 insertions, 21 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5c170340f684..26a6f113f00c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2693,7 +2693,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2693 break; 2693 break;
2694#endif 2694#endif
2695 case TCP_USER_TIMEOUT: 2695 case TCP_USER_TIMEOUT:
2696 /* Cap the max timeout in ms TCP will retry/retrans 2696 /* Cap the max time in ms TCP will retry or probe the window
2697 * before giving up and aborting (ETIMEDOUT) a connection. 2697 * before giving up and aborting (ETIMEDOUT) a connection.
2698 */ 2698 */
2699 if (val < 0) 2699 if (val < 0)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b24360f6e293..9b21ae8b2e31 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -52,7 +52,7 @@ static void tcp_write_err(struct sock *sk)
52 * limit. 52 * limit.
53 * 2. If we have strong memory pressure. 53 * 2. If we have strong memory pressure.
54 */ 54 */
55static int tcp_out_of_resources(struct sock *sk, int do_reset) 55static int tcp_out_of_resources(struct sock *sk, bool do_reset)
56{ 56{
57 struct tcp_sock *tp = tcp_sk(sk); 57 struct tcp_sock *tp = tcp_sk(sk);
58 int shift = 0; 58 int shift = 0;
@@ -72,7 +72,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
72 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || 72 if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
73 /* 2. Window is closed. */ 73 /* 2. Window is closed. */
74 (!tp->snd_wnd && !tp->packets_out)) 74 (!tp->snd_wnd && !tp->packets_out))
75 do_reset = 1; 75 do_reset = true;
76 if (do_reset) 76 if (do_reset)
77 tcp_send_active_reset(sk, GFP_ATOMIC); 77 tcp_send_active_reset(sk, GFP_ATOMIC);
78 tcp_done(sk); 78 tcp_done(sk);
@@ -270,40 +270,41 @@ static void tcp_probe_timer(struct sock *sk)
270 struct inet_connection_sock *icsk = inet_csk(sk); 270 struct inet_connection_sock *icsk = inet_csk(sk);
271 struct tcp_sock *tp = tcp_sk(sk); 271 struct tcp_sock *tp = tcp_sk(sk);
272 int max_probes; 272 int max_probes;
273 u32 start_ts;
273 274
274 if (tp->packets_out || !tcp_send_head(sk)) { 275 if (tp->packets_out || !tcp_send_head(sk)) {
275 icsk->icsk_probes_out = 0; 276 icsk->icsk_probes_out = 0;
276 return; 277 return;
277 } 278 }
278 279
279 /* *WARNING* RFC 1122 forbids this 280 /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as
280 * 281 * long as the receiver continues to respond probes. We support this by
281 * It doesn't AFAIK, because we kill the retransmit timer -AK 282 * default and reset icsk_probes_out with incoming ACKs. But if the
282 * 283 * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we
283 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing 284 * kill the socket when the retry count and the time exceeds the
284 * this behaviour in Solaris down as a bug fix. [AC] 285 * corresponding system limit. We also implement similar policy when
285 * 286 * we use RTO to probe window in tcp_retransmit_timer().
286 * Let me to explain. icsk_probes_out is zeroed by incoming ACKs
287 * even if they advertise zero window. Hence, connection is killed only
288 * if we received no ACKs for normal connection timeout. It is not killed
289 * only because window stays zero for some time, window may be zero
290 * until armageddon and even later. We are in full accordance
291 * with RFCs, only probe timer combines both retransmission timeout
292 * and probe timeout in one bottle. --ANK
293 */ 287 */
294 max_probes = sysctl_tcp_retries2; 288 start_ts = tcp_skb_timestamp(tcp_send_head(sk));
289 if (!start_ts)
290 skb_mstamp_get(&tcp_send_head(sk)->skb_mstamp);
291 else if (icsk->icsk_user_timeout &&
292 (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
293 goto abort;
295 294
295 max_probes = sysctl_tcp_retries2;
296 if (sock_flag(sk, SOCK_DEAD)) { 296 if (sock_flag(sk, SOCK_DEAD)) {
297 const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; 297 const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
298 298
299 max_probes = tcp_orphan_retries(sk, alive); 299 max_probes = tcp_orphan_retries(sk, alive);
300 300 if (!alive && icsk->icsk_backoff >= max_probes)
301 if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) 301 goto abort;
302 if (tcp_out_of_resources(sk, true))
302 return; 303 return;
303 } 304 }
304 305
305 if (icsk->icsk_probes_out > max_probes) { 306 if (icsk->icsk_probes_out > max_probes) {
306 tcp_write_err(sk); 307abort: tcp_write_err(sk);
307 } else { 308 } else {
308 /* Only send another probe if we didn't close things up. */ 309 /* Only send another probe if we didn't close things up. */
309 tcp_send_probe0(sk); 310 tcp_send_probe0(sk);