diff options
author | Jason Baron <jbaron@akamai.com> | 2017-01-17 13:37:19 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-01-17 15:51:55 -0500 |
commit | 0e40f4c9593ba2c7c30150ed669da97bd581c0cd (patch) | |
tree | 274f40b12fadf479f3579290c406cbf93c4857b6 /net/ipv4/tcp_input.c | |
parent | a870a97757dd4f165f4f7bb749350bee7df31716 (diff) |
tcp: accept RST for rcv_nxt - 1 after receiving a FIN
Using a Mac OSX box as a client connecting to a Linux server, we have found
that when certain applications (such as 'ab') are abruptly terminated
(via ^C), a FIN is sent followed by a RST packet on tcp connections. The
FIN is accepted by the Linux stack but the RST is sent with the same
sequence number as the FIN, and Linux responds with a challenge ACK per
RFC 5961. The OSX client, whose replies are rate-limited, then sometimes
does not reply with the RST that would be expected from a closed socket.
This results in sockets accumulating on the Linux server, left mostly in
the CLOSE_WAIT state, although LAST_ACK and CLOSING are also possible.
This sequence of events can tie up a lot of resources on the Linux server
since there may be a lot of data in write buffers at the time of the RST.
Accepting a RST equal to rcv_nxt - 1, after we have already successfully
processed a FIN, has made a significant difference for us in practice, by
freeing up unneeded resources in a more expedient fashion.
A packetdrill test demonstrating the behavior:
// testing mac osx rst behavior
// Establish a connection
0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0
0.100 < S 0:0(0) win 32768 <mss 1460,nop,wscale 10>
0.100 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 5>
0.200 < . 1:1(0) ack 1 win 32768
0.200 accept(3, ..., ...) = 4
// Client closes the connection
0.300 < F. 1:1(0) ack 1 win 32768
// now send rst with same sequence
0.300 < R. 1:1(0) ack 1 win 32768
// make sure we are in TCP_CLOSE
0.400 %{
assert tcpi_state == 7
}%
Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: Eric Dumazet <edumazet@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 28 |
1 files changed, 25 insertions, 3 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1a34e9278c07..bfa165cc455a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -5199,6 +5199,23 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) | |||
5199 | return err; | 5199 | return err; |
5200 | } | 5200 | } |
5201 | 5201 | ||
5202 | /* Accept RST for rcv_nxt - 1 after a FIN. | ||
5203 | * When tcp connections are abruptly terminated from Mac OSX (via ^C), a | ||
5204 | * FIN is sent followed by a RST packet. The RST is sent with the same | ||
5205 | * sequence number as the FIN, and thus according to RFC 5961 a challenge | ||
5206 | * ACK should be sent. However, Mac OSX rate limits replies to challenge | ||
5207 | * ACKs on the closed socket. In addition middleboxes can drop either the | ||
5208 | * challenge ACK or a subsequent RST. | ||
5209 | */ | ||
5210 | static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb) | ||
5211 | { | ||
5212 | struct tcp_sock *tp = tcp_sk(sk); | ||
5213 | |||
5214 | return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) && | ||
5215 | (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | | ||
5216 | TCPF_CLOSING)); | ||
5217 | } | ||
5218 | |||
5202 | /* Does PAWS and seqno based validation of an incoming segment, flags will | 5219 | /* Does PAWS and seqno based validation of an incoming segment, flags will |
5203 | * play significant role here. | 5220 | * play significant role here. |
5204 | */ | 5221 | */ |
@@ -5237,20 +5254,25 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | |||
5237 | LINUX_MIB_TCPACKSKIPPEDSEQ, | 5254 | LINUX_MIB_TCPACKSKIPPEDSEQ, |
5238 | &tp->last_oow_ack_time)) | 5255 | &tp->last_oow_ack_time)) |
5239 | tcp_send_dupack(sk, skb); | 5256 | tcp_send_dupack(sk, skb); |
5257 | } else if (tcp_reset_check(sk, skb)) { | ||
5258 | tcp_reset(sk); | ||
5240 | } | 5259 | } |
5241 | goto discard; | 5260 | goto discard; |
5242 | } | 5261 | } |
5243 | 5262 | ||
5244 | /* Step 2: check RST bit */ | 5263 | /* Step 2: check RST bit */ |
5245 | if (th->rst) { | 5264 | if (th->rst) { |
5246 | /* RFC 5961 3.2 (extend to match against SACK too if available): | 5265 | /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a |
5247 | * If seq num matches RCV.NXT or the right-most SACK block, | 5266 | * FIN and SACK too if available): |
5267 | * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or | ||
5268 | * the right-most SACK block, | ||
5248 | * then | 5269 | * then |
5249 | * RESET the connection | 5270 | * RESET the connection |
5250 | * else | 5271 | * else |
5251 | * Send a challenge ACK | 5272 | * Send a challenge ACK |
5252 | */ | 5273 | */ |
5253 | if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { | 5274 | if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || |
5275 | tcp_reset_check(sk, skb)) { | ||
5254 | rst_seq_match = true; | 5276 | rst_seq_match = true; |
5255 | } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { | 5277 | } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { |
5256 | struct tcp_sack_block *sp = &tp->selective_acks[0]; | 5278 | struct tcp_sack_block *sp = &tp->selective_acks[0]; |