diff options
author | Jerry Chu <hkchu@google.com> | 2012-08-31 08:29:12 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-08-31 20:02:19 -0400 |
commit | 8336886f786fdacbc19b719c1f7ea91eb70706d4 (patch) | |
tree | c1fa912f7583ce0ffcb5ae673802da4a7dfb3b19 /net/ipv4 | |
parent | 1046716368979dee857a2b8a91c4a8833f21b9cb (diff) |
tcp: TCP Fast Open Server - support TFO listeners
This patch builds on top of the previous patch to add the support
for TFO listeners. This includes -
1. allocating, properly initializing, and managing the per listener
fastopen_queue structure when TFO is enabled
2. changes to the inet_csk_accept code to support TFO. E.g., the
request_sock can no longer be freed upon accept(), not until 3WHS
finishes
3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg()
if it's a TFO socket
4. properly closing a TFO listener, and a TFO socket before 3WHS
finishes
5. supporting TCP_FASTOPEN socket option
6. modifying tcp_check_req() so that it can be used to check a TFO
socket as well as a request_sock
7. supporting TCP's TFO cookie option
8. adding a new SYN-ACK retransmit handler to use the timer directly
off the TFO socket rather than the listener socket. Note that TFO
server side will not retransmit anything other than SYN-ACK until
the 3WHS is completed.
The patch also contains an important function
"reqsk_fastopen_remove()" to manage the somewhat complex relation
between a listener, its request_sock, and the corresponding child
socket. See the comment above the function for the detail.
Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/af_inet.c | 28 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 57 | ||||
-rw-r--r-- | net/ipv4/syncookies.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 49 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 61 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 21 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 39 |
8 files changed, 227 insertions, 33 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6681ccf5c3ee..4f70ef0b946d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk) | |||
149 | pr_err("Attempt to release alive inet socket %p\n", sk); | 149 | pr_err("Attempt to release alive inet socket %p\n", sk); |
150 | return; | 150 | return; |
151 | } | 151 | } |
152 | if (sk->sk_type == SOCK_STREAM) { | ||
153 | struct fastopen_queue *fastopenq = | ||
154 | inet_csk(sk)->icsk_accept_queue.fastopenq; | ||
155 | kfree(fastopenq); | ||
156 | } | ||
152 | 157 | ||
153 | WARN_ON(atomic_read(&sk->sk_rmem_alloc)); | 158 | WARN_ON(atomic_read(&sk->sk_rmem_alloc)); |
154 | WARN_ON(atomic_read(&sk->sk_wmem_alloc)); | 159 | WARN_ON(atomic_read(&sk->sk_wmem_alloc)); |
@@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog) | |||
212 | * we can only allow the backlog to be adjusted. | 217 | * we can only allow the backlog to be adjusted. |
213 | */ | 218 | */ |
214 | if (old_state != TCP_LISTEN) { | 219 | if (old_state != TCP_LISTEN) { |
220 | /* Check special setups for testing purpose to enable TFO w/o | ||
221 | * requiring TCP_FASTOPEN sockopt. | ||
222 | * Note that only TCP sockets (SOCK_STREAM) will reach here. | ||
223 | * Also fastopenq may already have been allocated because this | ||
224 | * socket was in TCP_LISTEN state previously but was | ||
225 | * shutdown() (rather than close()). | ||
226 | */ | ||
227 | if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && | ||
228 | inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) { | ||
229 | if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0) | ||
230 | err = fastopen_init_queue(sk, backlog); | ||
231 | else if ((sysctl_tcp_fastopen & | ||
232 | TFO_SERVER_WO_SOCKOPT2) != 0) | ||
233 | err = fastopen_init_queue(sk, | ||
234 | ((uint)sysctl_tcp_fastopen) >> 16); | ||
235 | else | ||
236 | err = 0; | ||
237 | if (err) | ||
238 | goto out; | ||
239 | } | ||
215 | err = inet_csk_listen_start(sk, backlog); | 240 | err = inet_csk_listen_start(sk, backlog); |
216 | if (err) | 241 | if (err) |
217 | goto out; | 242 | goto out; |
@@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags) | |||
701 | 726 | ||
702 | sock_rps_record_flow(sk2); | 727 | sock_rps_record_flow(sk2); |
703 | WARN_ON(!((1 << sk2->sk_state) & | 728 | WARN_ON(!((1 << sk2->sk_state) & |
704 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); | 729 | (TCPF_ESTABLISHED | TCPF_SYN_RECV | |
730 | TCPF_CLOSE_WAIT | TCPF_CLOSE))); | ||
705 | 731 | ||
706 | sock_graft(sk2, newsock); | 732 | sock_graft(sk2, newsock); |
707 | 733 | ||
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7f75f21d7b83..8464b79c493f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c | |||
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) | |||
283 | struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) | 283 | struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) |
284 | { | 284 | { |
285 | struct inet_connection_sock *icsk = inet_csk(sk); | 285 | struct inet_connection_sock *icsk = inet_csk(sk); |
286 | struct request_sock_queue *queue = &icsk->icsk_accept_queue; | ||
286 | struct sock *newsk; | 287 | struct sock *newsk; |
288 | struct request_sock *req; | ||
287 | int error; | 289 | int error; |
288 | 290 | ||
289 | lock_sock(sk); | 291 | lock_sock(sk); |
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) | |||
296 | goto out_err; | 298 | goto out_err; |
297 | 299 | ||
298 | /* Find already established connection */ | 300 | /* Find already established connection */ |
299 | if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { | 301 | if (reqsk_queue_empty(queue)) { |
300 | long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); | 302 | long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); |
301 | 303 | ||
302 | /* If this is a non blocking socket don't sleep */ | 304 | /* If this is a non blocking socket don't sleep */ |
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) | |||
308 | if (error) | 310 | if (error) |
309 | goto out_err; | 311 | goto out_err; |
310 | } | 312 | } |
311 | 313 | req = reqsk_queue_remove(queue); | |
312 | newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); | 314 | newsk = req->sk; |
313 | WARN_ON(newsk->sk_state == TCP_SYN_RECV); | 315 | |
316 | sk_acceptq_removed(sk); | ||
317 | if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) { | ||
318 | spin_lock_bh(&queue->fastopenq->lock); | ||
319 | if (tcp_rsk(req)->listener) { | ||
320 | /* We are still waiting for the final ACK from 3WHS | ||
321 | * so can't free req now. Instead, we set req->sk to | ||
322 | * NULL to signify that the child socket is taken | ||
323 | * so reqsk_fastopen_remove() will free the req | ||
324 | * when 3WHS finishes (or is aborted). | ||
325 | */ | ||
326 | req->sk = NULL; | ||
327 | req = NULL; | ||
328 | } | ||
329 | spin_unlock_bh(&queue->fastopenq->lock); | ||
330 | } | ||
314 | out: | 331 | out: |
315 | release_sock(sk); | 332 | release_sock(sk); |
333 | if (req) | ||
334 | __reqsk_free(req); | ||
316 | return newsk; | 335 | return newsk; |
317 | out_err: | 336 | out_err: |
318 | newsk = NULL; | 337 | newsk = NULL; |
338 | req = NULL; | ||
319 | *err = error; | 339 | *err = error; |
320 | goto out; | 340 | goto out; |
321 | } | 341 | } |
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start); | |||
720 | void inet_csk_listen_stop(struct sock *sk) | 740 | void inet_csk_listen_stop(struct sock *sk) |
721 | { | 741 | { |
722 | struct inet_connection_sock *icsk = inet_csk(sk); | 742 | struct inet_connection_sock *icsk = inet_csk(sk); |
743 | struct request_sock_queue *queue = &icsk->icsk_accept_queue; | ||
723 | struct request_sock *acc_req; | 744 | struct request_sock *acc_req; |
724 | struct request_sock *req; | 745 | struct request_sock *req; |
725 | 746 | ||
726 | inet_csk_delete_keepalive_timer(sk); | 747 | inet_csk_delete_keepalive_timer(sk); |
727 | 748 | ||
728 | /* make all the listen_opt local to us */ | 749 | /* make all the listen_opt local to us */ |
729 | acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); | 750 | acc_req = reqsk_queue_yank_acceptq(queue); |
730 | 751 | ||
731 | /* Following specs, it would be better either to send FIN | 752 | /* Following specs, it would be better either to send FIN |
732 | * (and enter FIN-WAIT-1, it is normal close) | 753 | * (and enter FIN-WAIT-1, it is normal close) |
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk) | |||
736 | * To be honest, we are not able to make either | 757 | * To be honest, we are not able to make either |
737 | * of the variants now. --ANK | 758 | * of the variants now. --ANK |
738 | */ | 759 | */ |
739 | reqsk_queue_destroy(&icsk->icsk_accept_queue); | 760 | reqsk_queue_destroy(queue); |
740 | 761 | ||
741 | while ((req = acc_req) != NULL) { | 762 | while ((req = acc_req) != NULL) { |
742 | struct sock *child = req->sk; | 763 | struct sock *child = req->sk; |
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk) | |||
754 | 775 | ||
755 | percpu_counter_inc(sk->sk_prot->orphan_count); | 776 | percpu_counter_inc(sk->sk_prot->orphan_count); |
756 | 777 | ||
778 | if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) { | ||
779 | BUG_ON(tcp_sk(child)->fastopen_rsk != req); | ||
780 | BUG_ON(sk != tcp_rsk(req)->listener); | ||
781 | |||
782 | /* Paranoid, to prevent race condition if | ||
783 | * an inbound pkt destined for child is | ||
784 | * blocked by sock lock in tcp_v4_rcv(). | ||
785 | * Also to satisfy an assertion in | ||
786 | * tcp_v4_destroy_sock(). | ||
787 | */ | ||
788 | tcp_sk(child)->fastopen_rsk = NULL; | ||
789 | sock_put(sk); | ||
790 | } | ||
757 | inet_csk_destroy_sock(child); | 791 | inet_csk_destroy_sock(child); |
758 | 792 | ||
759 | bh_unlock_sock(child); | 793 | bh_unlock_sock(child); |
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk) | |||
763 | sk_acceptq_removed(sk); | 797 | sk_acceptq_removed(sk); |
764 | __reqsk_free(req); | 798 | __reqsk_free(req); |
765 | } | 799 | } |
800 | if (queue->fastopenq != NULL) { | ||
801 | /* Free all the reqs queued in rskq_rst_head. */ | ||
802 | spin_lock_bh(&queue->fastopenq->lock); | ||
803 | acc_req = queue->fastopenq->rskq_rst_head; | ||
804 | queue->fastopenq->rskq_rst_head = NULL; | ||
805 | spin_unlock_bh(&queue->fastopenq->lock); | ||
806 | while ((req = acc_req) != NULL) { | ||
807 | acc_req = req->dl_next; | ||
808 | __reqsk_free(req); | ||
809 | } | ||
810 | } | ||
766 | WARN_ON(sk->sk_ack_backlog); | 811 | WARN_ON(sk->sk_ack_backlog); |
767 | } | 812 | } |
768 | EXPORT_SYMBOL_GPL(inet_csk_listen_stop); | 813 | EXPORT_SYMBOL_GPL(inet_csk_listen_stop); |
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 650e1528e1e6..ba48e799b031 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
319 | ireq->tstamp_ok = tcp_opt.saw_tstamp; | 319 | ireq->tstamp_ok = tcp_opt.saw_tstamp; |
320 | req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; | 320 | req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; |
321 | treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; | 321 | treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; |
322 | treq->listener = NULL; | ||
322 | 323 | ||
323 | /* We throwed the options of the initial SYN away, so we hope | 324 | /* We throwed the options of the initial SYN away, so we hope |
324 | * the ACK carries the same options again (see RFC1122 4.2.3.8) | 325 | * the ACK carries the same options again (see RFC1122 4.2.3.8) |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2109ff4a1daf..df83d744e380 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
486 | if (sk->sk_shutdown & RCV_SHUTDOWN) | 486 | if (sk->sk_shutdown & RCV_SHUTDOWN) |
487 | mask |= POLLIN | POLLRDNORM | POLLRDHUP; | 487 | mask |= POLLIN | POLLRDNORM | POLLRDHUP; |
488 | 488 | ||
489 | /* Connected? */ | 489 | /* Connected or passive Fast Open socket? */ |
490 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 490 | if (sk->sk_state != TCP_SYN_SENT && |
491 | (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) { | ||
491 | int target = sock_rcvlowat(sk, 0, INT_MAX); | 492 | int target = sock_rcvlowat(sk, 0, INT_MAX); |
492 | 493 | ||
493 | if (tp->urg_seq == tp->copied_seq && | 494 | if (tp->urg_seq == tp->copied_seq && |
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse | |||
840 | ssize_t copied; | 841 | ssize_t copied; |
841 | long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 842 | long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
842 | 843 | ||
843 | /* Wait for a connection to finish. */ | 844 | /* Wait for a connection to finish. One exception is TCP Fast Open |
844 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) | 845 | * (passive side) where data is allowed to be sent before a connection |
846 | * is fully established. | ||
847 | */ | ||
848 | if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && | ||
849 | !tcp_passive_fastopen(sk)) { | ||
845 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) | 850 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) |
846 | goto out_err; | 851 | goto out_err; |
852 | } | ||
847 | 853 | ||
848 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); | 854 | clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); |
849 | 855 | ||
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1042 | 1048 | ||
1043 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); | 1049 | timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); |
1044 | 1050 | ||
1045 | /* Wait for a connection to finish. */ | 1051 | /* Wait for a connection to finish. One exception is TCP Fast Open |
1046 | if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) | 1052 | * (passive side) where data is allowed to be sent before a connection |
1053 | * is fully established. | ||
1054 | */ | ||
1055 | if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && | ||
1056 | !tcp_passive_fastopen(sk)) { | ||
1047 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) | 1057 | if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) |
1048 | goto do_error; | 1058 | goto do_error; |
1059 | } | ||
1049 | 1060 | ||
1050 | if (unlikely(tp->repair)) { | 1061 | if (unlikely(tp->repair)) { |
1051 | if (tp->repair_queue == TCP_RECV_QUEUE) { | 1062 | if (tp->repair_queue == TCP_RECV_QUEUE) { |
@@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout) | |||
2144 | * they look as CLOSING or LAST_ACK for Linux) | 2155 | * they look as CLOSING or LAST_ACK for Linux) |
2145 | * Probably, I missed some more holelets. | 2156 | * Probably, I missed some more holelets. |
2146 | * --ANK | 2157 | * --ANK |
2158 | * XXX (TFO) - To start off we don't support SYN+ACK+FIN | ||
2159 | * in a single packet! (May consider it later but will | ||
2160 | * probably need API support or TCP_CORK SYN-ACK until | ||
2161 | * data is written and socket is closed.) | ||
2147 | */ | 2162 | */ |
2148 | tcp_send_fin(sk); | 2163 | tcp_send_fin(sk); |
2149 | } | 2164 | } |
@@ -2215,8 +2230,16 @@ adjudge_to_death: | |||
2215 | } | 2230 | } |
2216 | } | 2231 | } |
2217 | 2232 | ||
2218 | if (sk->sk_state == TCP_CLOSE) | 2233 | if (sk->sk_state == TCP_CLOSE) { |
2234 | struct request_sock *req = tcp_sk(sk)->fastopen_rsk; | ||
2235 | /* We could get here with a non-NULL req if the socket is | ||
2236 | * aborted (e.g., closed with unread data) before 3WHS | ||
2237 | * finishes. | ||
2238 | */ | ||
2239 | if (req != NULL) | ||
2240 | reqsk_fastopen_remove(sk, req, false); | ||
2219 | inet_csk_destroy_sock(sk); | 2241 | inet_csk_destroy_sock(sk); |
2242 | } | ||
2220 | /* Otherwise, socket is reprieved until protocol close. */ | 2243 | /* Otherwise, socket is reprieved until protocol close. */ |
2221 | 2244 | ||
2222 | out: | 2245 | out: |
@@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2688 | else | 2711 | else |
2689 | icsk->icsk_user_timeout = msecs_to_jiffies(val); | 2712 | icsk->icsk_user_timeout = msecs_to_jiffies(val); |
2690 | break; | 2713 | break; |
2714 | |||
2715 | case TCP_FASTOPEN: | ||
2716 | if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | | ||
2717 | TCPF_LISTEN))) | ||
2718 | err = fastopen_init_queue(sk, val); | ||
2719 | else | ||
2720 | err = -EINVAL; | ||
2721 | break; | ||
2691 | default: | 2722 | default: |
2692 | err = -ENOPROTOOPT; | 2723 | err = -ENOPROTOOPT; |
2693 | break; | 2724 | break; |
@@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator); | |||
3501 | 3532 | ||
3502 | void tcp_done(struct sock *sk) | 3533 | void tcp_done(struct sock *sk) |
3503 | { | 3534 | { |
3535 | struct request_sock *req = tcp_sk(sk)->fastopen_rsk; | ||
3536 | |||
3504 | if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) | 3537 | if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) |
3505 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); | 3538 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); |
3506 | 3539 | ||
3507 | tcp_set_state(sk, TCP_CLOSE); | 3540 | tcp_set_state(sk, TCP_CLOSE); |
3508 | tcp_clear_xmit_timers(sk); | 3541 | tcp_clear_xmit_timers(sk); |
3542 | if (req != NULL) | ||
3543 | reqsk_fastopen_remove(sk, req, false); | ||
3509 | 3544 | ||
3510 | sk->sk_shutdown = SHUTDOWN_MASK; | 3545 | sk->sk_shutdown = SHUTDOWN_MASK; |
3511 | 3546 | ||
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 36f02f954ac1..bb148dee1edd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -839,7 +839,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
839 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) | 839 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) |
840 | return -1; | 840 | return -1; |
841 | 841 | ||
842 | skb = tcp_make_synack(sk, dst, req, rvp); | 842 | skb = tcp_make_synack(sk, dst, req, rvp, NULL); |
843 | 843 | ||
844 | if (skb) { | 844 | if (skb) { |
845 | __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); | 845 | __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); |
@@ -1554,7 +1554,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) | |||
1554 | struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, | 1554 | struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, |
1555 | iph->saddr, iph->daddr); | 1555 | iph->saddr, iph->daddr); |
1556 | if (req) | 1556 | if (req) |
1557 | return tcp_check_req(sk, skb, req, prev); | 1557 | return tcp_check_req(sk, skb, req, prev, false); |
1558 | 1558 | ||
1559 | nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, | 1559 | nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, |
1560 | th->source, iph->daddr, th->dest, inet_iif(skb)); | 1560 | th->source, iph->daddr, th->dest, inet_iif(skb)); |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6ff7f10dce9d..e965319d610b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
507 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; | 507 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; |
508 | newtp->rx_opt.mss_clamp = req->mss; | 508 | newtp->rx_opt.mss_clamp = req->mss; |
509 | TCP_ECN_openreq_child(newtp, req); | 509 | TCP_ECN_openreq_child(newtp, req); |
510 | newtp->fastopen_rsk = NULL; | ||
510 | 511 | ||
511 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); | 512 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); |
512 | } | 513 | } |
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
515 | EXPORT_SYMBOL(tcp_create_openreq_child); | 516 | EXPORT_SYMBOL(tcp_create_openreq_child); |
516 | 517 | ||
517 | /* | 518 | /* |
518 | * Process an incoming packet for SYN_RECV sockets represented | 519 | * Process an incoming packet for SYN_RECV sockets represented as a |
519 | * as a request_sock. | 520 | * request_sock. Normally sk is the listener socket but for TFO it |
521 | * points to the child socket. | ||
522 | * | ||
523 | * XXX (TFO) - The current impl contains a special check for ack | ||
524 | * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? | ||
520 | */ | 525 | */ |
521 | 526 | ||
522 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | 527 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, |
523 | struct request_sock *req, | 528 | struct request_sock *req, |
524 | struct request_sock **prev) | 529 | struct request_sock **prev, |
530 | bool fastopen) | ||
525 | { | 531 | { |
526 | struct tcp_options_received tmp_opt; | 532 | struct tcp_options_received tmp_opt; |
527 | const u8 *hash_location; | 533 | const u8 *hash_location; |
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
530 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); | 536 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); |
531 | bool paws_reject = false; | 537 | bool paws_reject = false; |
532 | 538 | ||
539 | BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); | ||
540 | |||
533 | tmp_opt.saw_tstamp = 0; | 541 | tmp_opt.saw_tstamp = 0; |
534 | if (th->doff > (sizeof(struct tcphdr)>>2)) { | 542 | if (th->doff > (sizeof(struct tcphdr)>>2)) { |
535 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); | 543 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); |
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
565 | * | 573 | * |
566 | * Enforce "SYN-ACK" according to figure 8, figure 6 | 574 | * Enforce "SYN-ACK" according to figure 8, figure 6 |
567 | * of RFC793, fixed by RFC1122. | 575 | * of RFC793, fixed by RFC1122. |
576 | * | ||
577 | * Note that even if there is new data in the SYN packet | ||
578 | * they will be thrown away too. | ||
568 | */ | 579 | */ |
569 | req->rsk_ops->rtx_syn_ack(sk, req, NULL); | 580 | req->rsk_ops->rtx_syn_ack(sk, req, NULL); |
570 | return NULL; | 581 | return NULL; |
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
622 | * sent (the segment carries an unacceptable ACK) ... | 633 | * sent (the segment carries an unacceptable ACK) ... |
623 | * a reset is sent." | 634 | * a reset is sent." |
624 | * | 635 | * |
625 | * Invalid ACK: reset will be sent by listening socket | 636 | * Invalid ACK: reset will be sent by listening socket. |
637 | * Note that the ACK validity check for a Fast Open socket is done | ||
638 | * elsewhere and is checked directly against the child socket rather | ||
639 | * than req because user data may have been sent out. | ||
626 | */ | 640 | */ |
627 | if ((flg & TCP_FLAG_ACK) && | 641 | if ((flg & TCP_FLAG_ACK) && !fastopen && |
628 | (TCP_SKB_CB(skb)->ack_seq != | 642 | (TCP_SKB_CB(skb)->ack_seq != |
629 | tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) | 643 | tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) |
630 | return sk; | 644 | return sk; |
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
637 | /* RFC793: "first check sequence number". */ | 651 | /* RFC793: "first check sequence number". */ |
638 | 652 | ||
639 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 653 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
640 | tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { | 654 | tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { |
641 | /* Out of window: send ACK and drop. */ | 655 | /* Out of window: send ACK and drop. */ |
642 | if (!(flg & TCP_FLAG_RST)) | 656 | if (!(flg & TCP_FLAG_RST)) |
643 | req->rsk_ops->send_ack(sk, skb, req); | 657 | req->rsk_ops->send_ack(sk, skb, req); |
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
648 | 662 | ||
649 | /* In sequence, PAWS is OK. */ | 663 | /* In sequence, PAWS is OK. */ |
650 | 664 | ||
651 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) | 665 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) |
652 | req->ts_recent = tmp_opt.rcv_tsval; | 666 | req->ts_recent = tmp_opt.rcv_tsval; |
653 | 667 | ||
654 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { | 668 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { |
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
667 | 681 | ||
668 | /* ACK sequence verified above, just make sure ACK is | 682 | /* ACK sequence verified above, just make sure ACK is |
669 | * set. If ACK not set, just silently drop the packet. | 683 | * set. If ACK not set, just silently drop the packet. |
684 | * | ||
685 | * XXX (TFO) - if we ever allow "data after SYN", the | ||
686 | * following check needs to be removed. | ||
670 | */ | 687 | */ |
671 | if (!(flg & TCP_FLAG_ACK)) | 688 | if (!(flg & TCP_FLAG_ACK)) |
672 | return NULL; | 689 | return NULL; |
673 | 690 | ||
691 | /* For Fast Open no more processing is needed (sk is the | ||
692 | * child socket). | ||
693 | */ | ||
694 | if (fastopen) | ||
695 | return sk; | ||
696 | |||
674 | /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ | 697 | /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ |
675 | if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && | 698 | if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && |
676 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | 699 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { |
@@ -706,11 +729,21 @@ listen_overflow: | |||
706 | } | 729 | } |
707 | 730 | ||
708 | embryonic_reset: | 731 | embryonic_reset: |
709 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); | 732 | if (!(flg & TCP_FLAG_RST)) { |
710 | if (!(flg & TCP_FLAG_RST)) | 733 | /* Received a bad SYN pkt - for TFO We try not to reset |
734 | * the local connection unless it's really necessary to | ||
735 | * avoid becoming vulnerable to outside attack aiming at | ||
736 | * resetting legit local connections. | ||
737 | */ | ||
711 | req->rsk_ops->send_reset(sk, skb); | 738 | req->rsk_ops->send_reset(sk, skb); |
712 | 739 | } else if (fastopen) { /* received a valid RST pkt */ | |
713 | inet_csk_reqsk_queue_drop(sk, req, prev); | 740 | reqsk_fastopen_remove(sk, req, true); |
741 | tcp_reset(sk); | ||
742 | } | ||
743 | if (!fastopen) { | ||
744 | inet_csk_reqsk_queue_drop(sk, req, prev); | ||
745 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); | ||
746 | } | ||
714 | return NULL; | 747 | return NULL; |
715 | } | 748 | } |
716 | EXPORT_SYMBOL(tcp_check_req); | 749 | EXPORT_SYMBOL(tcp_check_req); |
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req); | |||
719 | * Queue segment on the new socket if the new socket is active, | 752 | * Queue segment on the new socket if the new socket is active, |
720 | * otherwise we just shortcircuit this and continue with | 753 | * otherwise we just shortcircuit this and continue with |
721 | * the new socket. | 754 | * the new socket. |
755 | * | ||
756 | * For the vast majority of cases child->sk_state will be TCP_SYN_RECV | ||
757 | * when entering. But other states are possible due to a race condition | ||
758 | * where after __inet_lookup_established() fails but before the listener | ||
759 | * lock is obtained, other packets cause the same connection to | ||
760 | * be created. | ||
722 | */ | 761 | */ |
723 | 762 | ||
724 | int tcp_child_process(struct sock *parent, struct sock *child, | 763 | int tcp_child_process(struct sock *parent, struct sock *child, |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d04632673a9e..9383b51f3efc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk, | |||
702 | unsigned int mss, struct sk_buff *skb, | 702 | unsigned int mss, struct sk_buff *skb, |
703 | struct tcp_out_options *opts, | 703 | struct tcp_out_options *opts, |
704 | struct tcp_md5sig_key **md5, | 704 | struct tcp_md5sig_key **md5, |
705 | struct tcp_extend_values *xvp) | 705 | struct tcp_extend_values *xvp, |
706 | struct tcp_fastopen_cookie *foc) | ||
706 | { | 707 | { |
707 | struct inet_request_sock *ireq = inet_rsk(req); | 708 | struct inet_request_sock *ireq = inet_rsk(req); |
708 | unsigned int remaining = MAX_TCP_OPTION_SPACE; | 709 | unsigned int remaining = MAX_TCP_OPTION_SPACE; |
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk, | |||
747 | if (unlikely(!ireq->tstamp_ok)) | 748 | if (unlikely(!ireq->tstamp_ok)) |
748 | remaining -= TCPOLEN_SACKPERM_ALIGNED; | 749 | remaining -= TCPOLEN_SACKPERM_ALIGNED; |
749 | } | 750 | } |
750 | 751 | if (foc != NULL) { | |
752 | u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; | ||
753 | need = (need + 3) & ~3U; /* Align to 32 bits */ | ||
754 | if (remaining >= need) { | ||
755 | opts->options |= OPTION_FAST_OPEN_COOKIE; | ||
756 | opts->fastopen_cookie = foc; | ||
757 | remaining -= need; | ||
758 | } | ||
759 | } | ||
751 | /* Similar rationale to tcp_syn_options() applies here, too. | 760 | /* Similar rationale to tcp_syn_options() applies here, too. |
752 | * If the <SYN> options fit, the same options should fit now! | 761 | * If the <SYN> options fit, the same options should fit now! |
753 | */ | 762 | */ |
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk) | |||
2658 | */ | 2667 | */ |
2659 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | 2668 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, |
2660 | struct request_sock *req, | 2669 | struct request_sock *req, |
2661 | struct request_values *rvp) | 2670 | struct request_values *rvp, |
2671 | struct tcp_fastopen_cookie *foc) | ||
2662 | { | 2672 | { |
2663 | struct tcp_out_options opts; | 2673 | struct tcp_out_options opts; |
2664 | struct tcp_extend_values *xvp = tcp_xv(rvp); | 2674 | struct tcp_extend_values *xvp = tcp_xv(rvp); |
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2718 | #endif | 2728 | #endif |
2719 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2729 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
2720 | tcp_header_size = tcp_synack_options(sk, req, mss, | 2730 | tcp_header_size = tcp_synack_options(sk, req, mss, |
2721 | skb, &opts, &md5, xvp) | 2731 | skb, &opts, &md5, xvp, foc) |
2722 | + sizeof(*th); | 2732 | + sizeof(*th); |
2723 | 2733 | ||
2724 | skb_push(skb, tcp_header_size); | 2734 | skb_push(skb, tcp_header_size); |
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2772 | } | 2782 | } |
2773 | 2783 | ||
2774 | th->seq = htonl(TCP_SKB_CB(skb)->seq); | 2784 | th->seq = htonl(TCP_SKB_CB(skb)->seq); |
2775 | th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); | 2785 | /* XXX data is queued and acked as is. No buffer/window check */ |
2786 | th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); | ||
2776 | 2787 | ||
2777 | /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ | 2788 | /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ |
2778 | th->window = htons(min(req->rcv_wnd, 65535U)); | 2789 | th->window = htons(min(req->rcv_wnd, 65535U)); |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b774a03bd1dc..fc04711e80c8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -305,6 +305,35 @@ static void tcp_probe_timer(struct sock *sk) | |||
305 | } | 305 | } |
306 | 306 | ||
307 | /* | 307 | /* |
308 | * Timer for Fast Open socket to retransmit SYNACK. Note that the | ||
309 | * sk here is the child socket, not the parent (listener) socket. | ||
310 | */ | ||
311 | static void tcp_fastopen_synack_timer(struct sock *sk) | ||
312 | { | ||
313 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
314 | int max_retries = icsk->icsk_syn_retries ? : | ||
315 | sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ | ||
316 | struct request_sock *req; | ||
317 | |||
318 | req = tcp_sk(sk)->fastopen_rsk; | ||
319 | req->rsk_ops->syn_ack_timeout(sk, req); | ||
320 | |||
321 | if (req->retrans >= max_retries) { | ||
322 | tcp_write_err(sk); | ||
323 | return; | ||
324 | } | ||
325 | /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error | ||
326 | * returned from rtx_syn_ack() to make it more persistent like | ||
327 | * regular retransmit because if the child socket has been accepted | ||
328 | * it's not good to give up too easily. | ||
329 | */ | ||
330 | req->rsk_ops->rtx_syn_ack(sk, req, NULL); | ||
331 | req->retrans++; | ||
332 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | ||
333 | TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX); | ||
334 | } | ||
335 | |||
336 | /* | ||
308 | * The TCP retransmit timer. | 337 | * The TCP retransmit timer. |
309 | */ | 338 | */ |
310 | 339 | ||
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk) | |||
317 | tcp_resume_early_retransmit(sk); | 346 | tcp_resume_early_retransmit(sk); |
318 | return; | 347 | return; |
319 | } | 348 | } |
320 | 349 | if (tp->fastopen_rsk) { | |
350 | BUG_ON(sk->sk_state != TCP_SYN_RECV && | ||
351 | sk->sk_state != TCP_FIN_WAIT1); | ||
352 | tcp_fastopen_synack_timer(sk); | ||
353 | /* Before we receive ACK to our SYN-ACK don't retransmit | ||
354 | * anything else (e.g., data or FIN segments). | ||
355 | */ | ||
356 | return; | ||
357 | } | ||
321 | if (!tp->packets_out) | 358 | if (!tp->packets_out) |
322 | goto out; | 359 | goto out; |
323 | 360 | ||