aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorJerry Chu <hkchu@google.com>2012-08-31 08:29:12 -0400
committerDavid S. Miller <davem@davemloft.net>2012-08-31 20:02:19 -0400
commit8336886f786fdacbc19b719c1f7ea91eb70706d4 (patch)
treec1fa912f7583ce0ffcb5ae673802da4a7dfb3b19 /net
parent1046716368979dee857a2b8a91c4a8833f21b9cb (diff)
tcp: TCP Fast Open Server - support TFO listeners
This patch builds on top of the previous patch to add the support for TFO listeners. This includes - 1. allocating, properly initializing, and managing the per listener fastopen_queue structure when TFO is enabled 2. changes to the inet_csk_accept code to support TFO. E.g., the request_sock can no longer be freed upon accept(), not until 3WHS finishes 3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg() if it's a TFO socket 4. properly closing a TFO listener, and a TFO socket before 3WHS finishes 5. supporting TCP_FASTOPEN socket option 6. modifying tcp_check_req() to use to check a TFO socket as well as request_sock 7. supporting TCP's TFO cookie option 8. adding a new SYN-ACK retransmit handler to use the timer directly off the TFO socket rather than the listener socket. Note that TFO server side will not retransmit anything other than SYN-ACK until the 3WHS is completed. The patch also contains an important function "reqsk_fastopen_remove()" to manage the somewhat complex relation between a listener, its request_sock, and the corresponding child socket. See the comment above the function for the detail. Signed-off-by: H.K. Jerry Chu <hkchu@google.com> Cc: Yuchung Cheng <ycheng@google.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/core/request_sock.c95
-rw-r--r--net/ipv4/af_inet.c28
-rw-r--r--net/ipv4/inet_connection_sock.c57
-rw-r--r--net/ipv4/syncookies.c1
-rw-r--r--net/ipv4/tcp.c49
-rw-r--r--net/ipv4/tcp_ipv4.c4
-rw-r--r--net/ipv4/tcp_minisocks.c61
-rw-r--r--net/ipv4/tcp_output.c21
-rw-r--r--net/ipv4/tcp_timer.c39
-rw-r--r--net/ipv6/syncookies.c1
-rw-r--r--net/ipv6/tcp_ipv6.c5
11 files changed, 326 insertions, 35 deletions
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 9b570a6a33c5..c31d9e8668c3 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -15,6 +15,7 @@
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/tcp.h>
18#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
19 20
20#include <net/request_sock.h> 21#include <net/request_sock.h>
@@ -130,3 +131,97 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
130 kfree(lopt); 131 kfree(lopt);
131} 132}
132 133
134/*
135 * This function is called to set a Fast Open socket's "fastopen_rsk" field
136 * to NULL when a TFO socket no longer needs to access the request_sock.
137 * This happens only after 3WHS has been either completed or aborted (e.g.,
138 * RST is received).
139 *
140 * Before TFO, a child socket is created only after 3WHS is completed,
141 * hence it never needs to access the request_sock. things get a lot more
142 * complex with TFO. A child socket, accepted or not, has to access its
143 * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
144 * until 3WHS is either completed or aborted. Afterwards the req will stay
145 * until either the child socket is accepted, or in the rare case when the
146 * listener is closed before the child is accepted.
147 *
148 * In short, a request socket is only freed after BOTH 3WHS has completed
149 * (or aborted) and the child socket has been accepted (or listener closed).
150 * When a child socket is accepted, its corresponding req->sk is set to
151 * NULL since it's no longer needed. More importantly, "req->sk == NULL"
152 * will be used by the code below to determine if a child socket has been
153 * accepted or not, and the check is protected by the fastopenq->lock
154 * described below.
155 *
156 * Note that fastopen_rsk is only accessed from the child socket's context
157 * with its socket lock held. But a request_sock (req) can be accessed by
158 * both its child socket through fastopen_rsk, and a listener socket through
159 * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
160 * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
161 * only in the rare case when both the listener and the child locks are held,
162 * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
163 * The lock also protects other fields such as fastopenq->qlen, which is
164 * decremented by this function when fastopen_rsk is no longer needed.
165 *
166 * Note that another solution was to simply use the existing socket lock
167 * from the listener. But first socket lock is difficult to use. It is not
168 * a simple spin lock - one must consider sock_owned_by_user() and arrange
169 * to use sk_add_backlog() stuff. But what really makes it infeasible is the
170 * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
171 * acquire a child's lock while holding listener's socket lock. A corner
172 * case might also exist in tcp_v4_hnd_req() that will trigger this locking
173 * order.
174 *
175 * When a TFO req is created, it needs to sock_hold its listener to prevent
176 * the latter data structure from going away.
177 *
178 * This function also sets "treq->listener" to NULL and unreference listener
179 * socket. treq->listener is used by the listener so it is protected by the
180 * fastopenq->lock in this function.
181 */
182void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
183 bool reset)
184{
185 struct sock *lsk = tcp_rsk(req)->listener;
186 struct fastopen_queue *fastopenq =
187 inet_csk(lsk)->icsk_accept_queue.fastopenq;
188
189 BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk));
190
191 tcp_sk(sk)->fastopen_rsk = NULL;
192 spin_lock_bh(&fastopenq->lock);
193 fastopenq->qlen--;
194 tcp_rsk(req)->listener = NULL;
195 if (req->sk) /* the child socket hasn't been accepted yet */
196 goto out;
197
198 if (!reset || lsk->sk_state != TCP_LISTEN) {
199 /* If the listener has been closed don't bother with the
200 * special RST handling below.
201 */
202 spin_unlock_bh(&fastopenq->lock);
203 sock_put(lsk);
204 reqsk_free(req);
205 return;
206 }
207 /* Wait for 60secs before removing a req that has triggered RST.
208 * This is a simple defense against TFO spoofing attack - by
209 * counting the req against fastopen.max_qlen, and disabling
210 * TFO when the qlen exceeds max_qlen.
211 *
212 * For more details see CoNext'11 "TCP Fast Open" paper.
213 */
214 req->expires = jiffies + 60*HZ;
215 if (fastopenq->rskq_rst_head == NULL)
216 fastopenq->rskq_rst_head = req;
217 else
218 fastopenq->rskq_rst_tail->dl_next = req;
219
220 req->dl_next = NULL;
221 fastopenq->rskq_rst_tail = req;
222 fastopenq->qlen++;
223out:
224 spin_unlock_bh(&fastopenq->lock);
225 sock_put(lsk);
226 return;
227}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6681ccf5c3ee..4f70ef0b946d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk)
149 pr_err("Attempt to release alive inet socket %p\n", sk); 149 pr_err("Attempt to release alive inet socket %p\n", sk);
150 return; 150 return;
151 } 151 }
152 if (sk->sk_type == SOCK_STREAM) {
153 struct fastopen_queue *fastopenq =
154 inet_csk(sk)->icsk_accept_queue.fastopenq;
155 kfree(fastopenq);
156 }
152 157
153 WARN_ON(atomic_read(&sk->sk_rmem_alloc)); 158 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
154 WARN_ON(atomic_read(&sk->sk_wmem_alloc)); 159 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
@@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog)
212 * we can only allow the backlog to be adjusted. 217 * we can only allow the backlog to be adjusted.
213 */ 218 */
214 if (old_state != TCP_LISTEN) { 219 if (old_state != TCP_LISTEN) {
220 /* Check special setups for testing purpose to enable TFO w/o
221 * requiring TCP_FASTOPEN sockopt.
222 * Note that only TCP sockets (SOCK_STREAM) will reach here.
223 * Also fastopenq may already been allocated because this
224 * socket was in TCP_LISTEN state previously but was
225 * shutdown() (rather than close()).
226 */
227 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
228 inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
229 if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
230 err = fastopen_init_queue(sk, backlog);
231 else if ((sysctl_tcp_fastopen &
232 TFO_SERVER_WO_SOCKOPT2) != 0)
233 err = fastopen_init_queue(sk,
234 ((uint)sysctl_tcp_fastopen) >> 16);
235 else
236 err = 0;
237 if (err)
238 goto out;
239 }
215 err = inet_csk_listen_start(sk, backlog); 240 err = inet_csk_listen_start(sk, backlog);
216 if (err) 241 if (err)
217 goto out; 242 goto out;
@@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
701 726
702 sock_rps_record_flow(sk2); 727 sock_rps_record_flow(sk2);
703 WARN_ON(!((1 << sk2->sk_state) & 728 WARN_ON(!((1 << sk2->sk_state) &
704 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE))); 729 (TCPF_ESTABLISHED | TCPF_SYN_RECV |
730 TCPF_CLOSE_WAIT | TCPF_CLOSE)));
705 731
706 sock_graft(sk2, newsock); 732 sock_graft(sk2, newsock);
707 733
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7f75f21d7b83..8464b79c493f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
283struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) 283struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
284{ 284{
285 struct inet_connection_sock *icsk = inet_csk(sk); 285 struct inet_connection_sock *icsk = inet_csk(sk);
286 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
286 struct sock *newsk; 287 struct sock *newsk;
288 struct request_sock *req;
287 int error; 289 int error;
288 290
289 lock_sock(sk); 291 lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
296 goto out_err; 298 goto out_err;
297 299
298 /* Find already established connection */ 300 /* Find already established connection */
299 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { 301 if (reqsk_queue_empty(queue)) {
300 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); 302 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
301 303
302 /* If this is a non blocking socket don't sleep */ 304 /* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
308 if (error) 310 if (error)
309 goto out_err; 311 goto out_err;
310 } 312 }
311 313 req = reqsk_queue_remove(queue);
312 newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); 314 newsk = req->sk;
313 WARN_ON(newsk->sk_state == TCP_SYN_RECV); 315
316 sk_acceptq_removed(sk);
317 if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) {
318 spin_lock_bh(&queue->fastopenq->lock);
319 if (tcp_rsk(req)->listener) {
320 /* We are still waiting for the final ACK from 3WHS
321 * so can't free req now. Instead, we set req->sk to
322 * NULL to signify that the child socket is taken
323 * so reqsk_fastopen_remove() will free the req
324 * when 3WHS finishes (or is aborted).
325 */
326 req->sk = NULL;
327 req = NULL;
328 }
329 spin_unlock_bh(&queue->fastopenq->lock);
330 }
314out: 331out:
315 release_sock(sk); 332 release_sock(sk);
333 if (req)
334 __reqsk_free(req);
316 return newsk; 335 return newsk;
317out_err: 336out_err:
318 newsk = NULL; 337 newsk = NULL;
338 req = NULL;
319 *err = error; 339 *err = error;
320 goto out; 340 goto out;
321} 341}
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
720void inet_csk_listen_stop(struct sock *sk) 740void inet_csk_listen_stop(struct sock *sk)
721{ 741{
722 struct inet_connection_sock *icsk = inet_csk(sk); 742 struct inet_connection_sock *icsk = inet_csk(sk);
743 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
723 struct request_sock *acc_req; 744 struct request_sock *acc_req;
724 struct request_sock *req; 745 struct request_sock *req;
725 746
726 inet_csk_delete_keepalive_timer(sk); 747 inet_csk_delete_keepalive_timer(sk);
727 748
728 /* make all the listen_opt local to us */ 749 /* make all the listen_opt local to us */
729 acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); 750 acc_req = reqsk_queue_yank_acceptq(queue);
730 751
731 /* Following specs, it would be better either to send FIN 752 /* Following specs, it would be better either to send FIN
732 * (and enter FIN-WAIT-1, it is normal close) 753 * (and enter FIN-WAIT-1, it is normal close)
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
736 * To be honest, we are not able to make either 757 * To be honest, we are not able to make either
737 * of the variants now. --ANK 758 * of the variants now. --ANK
738 */ 759 */
739 reqsk_queue_destroy(&icsk->icsk_accept_queue); 760 reqsk_queue_destroy(queue);
740 761
741 while ((req = acc_req) != NULL) { 762 while ((req = acc_req) != NULL) {
742 struct sock *child = req->sk; 763 struct sock *child = req->sk;
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
754 775
755 percpu_counter_inc(sk->sk_prot->orphan_count); 776 percpu_counter_inc(sk->sk_prot->orphan_count);
756 777
778 if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) {
779 BUG_ON(tcp_sk(child)->fastopen_rsk != req);
780 BUG_ON(sk != tcp_rsk(req)->listener);
781
782 /* Paranoid, to prevent race condition if
783 * an inbound pkt destined for child is
784 * blocked by sock lock in tcp_v4_rcv().
785 * Also to satisfy an assertion in
786 * tcp_v4_destroy_sock().
787 */
788 tcp_sk(child)->fastopen_rsk = NULL;
789 sock_put(sk);
790 }
757 inet_csk_destroy_sock(child); 791 inet_csk_destroy_sock(child);
758 792
759 bh_unlock_sock(child); 793 bh_unlock_sock(child);
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
763 sk_acceptq_removed(sk); 797 sk_acceptq_removed(sk);
764 __reqsk_free(req); 798 __reqsk_free(req);
765 } 799 }
800 if (queue->fastopenq != NULL) {
801 /* Free all the reqs queued in rskq_rst_head. */
802 spin_lock_bh(&queue->fastopenq->lock);
803 acc_req = queue->fastopenq->rskq_rst_head;
804 queue->fastopenq->rskq_rst_head = NULL;
805 spin_unlock_bh(&queue->fastopenq->lock);
806 while ((req = acc_req) != NULL) {
807 acc_req = req->dl_next;
808 __reqsk_free(req);
809 }
810 }
766 WARN_ON(sk->sk_ack_backlog); 811 WARN_ON(sk->sk_ack_backlog);
767} 812}
768EXPORT_SYMBOL_GPL(inet_csk_listen_stop); 813EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650e1528e1e6..ba48e799b031 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
319 ireq->tstamp_ok = tcp_opt.saw_tstamp; 319 ireq->tstamp_ok = tcp_opt.saw_tstamp;
320 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; 320 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
321 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; 321 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
322 treq->listener = NULL;
322 323
323 /* We throwed the options of the initial SYN away, so we hope 324 /* We throwed the options of the initial SYN away, so we hope
324 * the ACK carries the same options again (see RFC1122 4.2.3.8) 325 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2109ff4a1daf..df83d744e380 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
486 if (sk->sk_shutdown & RCV_SHUTDOWN) 486 if (sk->sk_shutdown & RCV_SHUTDOWN)
487 mask |= POLLIN | POLLRDNORM | POLLRDHUP; 487 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
488 488
489 /* Connected? */ 489 /* Connected or passive Fast Open socket? */
490 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { 490 if (sk->sk_state != TCP_SYN_SENT &&
491 (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
491 int target = sock_rcvlowat(sk, 0, INT_MAX); 492 int target = sock_rcvlowat(sk, 0, INT_MAX);
492 493
493 if (tp->urg_seq == tp->copied_seq && 494 if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
840 ssize_t copied; 841 ssize_t copied;
841 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 842 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
842 843
843 /* Wait for a connection to finish. */ 844 /* Wait for a connection to finish. One exception is TCP Fast Open
844 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 845 * (passive side) where data is allowed to be sent before a connection
846 * is fully established.
847 */
848 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
849 !tcp_passive_fastopen(sk)) {
845 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 850 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
846 goto out_err; 851 goto out_err;
852 }
847 853
848 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 854 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
849 855
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1042 1048
1043 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 1049 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1044 1050
1045 /* Wait for a connection to finish. */ 1051 /* Wait for a connection to finish. One exception is TCP Fast Open
1046 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) 1052 * (passive side) where data is allowed to be sent before a connection
1053 * is fully established.
1054 */
1055 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1056 !tcp_passive_fastopen(sk)) {
1047 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 1057 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1048 goto do_error; 1058 goto do_error;
1059 }
1049 1060
1050 if (unlikely(tp->repair)) { 1061 if (unlikely(tp->repair)) {
1051 if (tp->repair_queue == TCP_RECV_QUEUE) { 1062 if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout)
2144 * they look as CLOSING or LAST_ACK for Linux) 2155 * they look as CLOSING or LAST_ACK for Linux)
2145 * Probably, I missed some more holelets. 2156 * Probably, I missed some more holelets.
2146 * --ANK 2157 * --ANK
2158 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2159 * in a single packet! (May consider it later but will
2160 * probably need API support or TCP_CORK SYN-ACK until
2161 * data is written and socket is closed.)
2147 */ 2162 */
2148 tcp_send_fin(sk); 2163 tcp_send_fin(sk);
2149 } 2164 }
@@ -2215,8 +2230,16 @@ adjudge_to_death:
2215 } 2230 }
2216 } 2231 }
2217 2232
2218 if (sk->sk_state == TCP_CLOSE) 2233 if (sk->sk_state == TCP_CLOSE) {
2234 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2235 /* We could get here with a non-NULL req if the socket is
2236 * aborted (e.g., closed with unread data) before 3WHS
2237 * finishes.
2238 */
2239 if (req != NULL)
2240 reqsk_fastopen_remove(sk, req, false);
2219 inet_csk_destroy_sock(sk); 2241 inet_csk_destroy_sock(sk);
2242 }
2220 /* Otherwise, socket is reprieved until protocol close. */ 2243 /* Otherwise, socket is reprieved until protocol close. */
2221 2244
2222out: 2245out:
@@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2688 else 2711 else
2689 icsk->icsk_user_timeout = msecs_to_jiffies(val); 2712 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2690 break; 2713 break;
2714
2715 case TCP_FASTOPEN:
2716 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2717 TCPF_LISTEN)))
2718 err = fastopen_init_queue(sk, val);
2719 else
2720 err = -EINVAL;
2721 break;
2691 default: 2722 default:
2692 err = -ENOPROTOOPT; 2723 err = -ENOPROTOOPT;
2693 break; 2724 break;
@@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
3501 3532
3502void tcp_done(struct sock *sk) 3533void tcp_done(struct sock *sk)
3503{ 3534{
3535 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3536
3504 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 3537 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3505 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 3538 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3506 3539
3507 tcp_set_state(sk, TCP_CLOSE); 3540 tcp_set_state(sk, TCP_CLOSE);
3508 tcp_clear_xmit_timers(sk); 3541 tcp_clear_xmit_timers(sk);
3542 if (req != NULL)
3543 reqsk_fastopen_remove(sk, req, false);
3509 3544
3510 sk->sk_shutdown = SHUTDOWN_MASK; 3545 sk->sk_shutdown = SHUTDOWN_MASK;
3511 3546
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 36f02f954ac1..bb148dee1edd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -839,7 +839,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
839 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 839 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
840 return -1; 840 return -1;
841 841
842 skb = tcp_make_synack(sk, dst, req, rvp); 842 skb = tcp_make_synack(sk, dst, req, rvp, NULL);
843 843
844 if (skb) { 844 if (skb) {
845 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); 845 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -1554,7 +1554,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1554 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, 1554 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1555 iph->saddr, iph->daddr); 1555 iph->saddr, iph->daddr);
1556 if (req) 1556 if (req)
1557 return tcp_check_req(sk, skb, req, prev); 1557 return tcp_check_req(sk, skb, req, prev, false);
1558 1558
1559 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, 1559 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1560 th->source, iph->daddr, th->dest, inet_iif(skb)); 1560 th->source, iph->daddr, th->dest, inet_iif(skb));
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6ff7f10dce9d..e965319d610b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
508 newtp->rx_opt.mss_clamp = req->mss; 508 newtp->rx_opt.mss_clamp = req->mss;
509 TCP_ECN_openreq_child(newtp, req); 509 TCP_ECN_openreq_child(newtp, req);
510 newtp->fastopen_rsk = NULL;
510 511
511 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 512 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
512 } 513 }
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
515EXPORT_SYMBOL(tcp_create_openreq_child); 516EXPORT_SYMBOL(tcp_create_openreq_child);
516 517
517/* 518/*
518 * Process an incoming packet for SYN_RECV sockets represented 519 * Process an incoming packet for SYN_RECV sockets represented as a
519 * as a request_sock. 520 * request_sock. Normally sk is the listener socket but for TFO it
521 * points to the child socket.
522 *
523 * XXX (TFO) - The current impl contains a special check for ack
524 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
520 */ 525 */
521 526
522struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 527struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
523 struct request_sock *req, 528 struct request_sock *req,
524 struct request_sock **prev) 529 struct request_sock **prev,
530 bool fastopen)
525{ 531{
526 struct tcp_options_received tmp_opt; 532 struct tcp_options_received tmp_opt;
527 const u8 *hash_location; 533 const u8 *hash_location;
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
530 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 536 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
531 bool paws_reject = false; 537 bool paws_reject = false;
532 538
539 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
540
533 tmp_opt.saw_tstamp = 0; 541 tmp_opt.saw_tstamp = 0;
534 if (th->doff > (sizeof(struct tcphdr)>>2)) { 542 if (th->doff > (sizeof(struct tcphdr)>>2)) {
535 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 543 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
565 * 573 *
566 * Enforce "SYN-ACK" according to figure 8, figure 6 574 * Enforce "SYN-ACK" according to figure 8, figure 6
567 * of RFC793, fixed by RFC1122. 575 * of RFC793, fixed by RFC1122.
576 *
577 * Note that even if there is new data in the SYN packet
578 * they will be thrown away too.
568 */ 579 */
569 req->rsk_ops->rtx_syn_ack(sk, req, NULL); 580 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
570 return NULL; 581 return NULL;
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
622 * sent (the segment carries an unacceptable ACK) ... 633 * sent (the segment carries an unacceptable ACK) ...
623 * a reset is sent." 634 * a reset is sent."
624 * 635 *
625 * Invalid ACK: reset will be sent by listening socket 636 * Invalid ACK: reset will be sent by listening socket.
637 * Note that the ACK validity check for a Fast Open socket is done
638 * elsewhere and is checked directly against the child socket rather
639 * than req because user data may have been sent out.
626 */ 640 */
627 if ((flg & TCP_FLAG_ACK) && 641 if ((flg & TCP_FLAG_ACK) && !fastopen &&
628 (TCP_SKB_CB(skb)->ack_seq != 642 (TCP_SKB_CB(skb)->ack_seq !=
629 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 643 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
630 return sk; 644 return sk;
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
637 /* RFC793: "first check sequence number". */ 651 /* RFC793: "first check sequence number". */
638 652
639 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 653 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
640 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { 654 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
641 /* Out of window: send ACK and drop. */ 655 /* Out of window: send ACK and drop. */
642 if (!(flg & TCP_FLAG_RST)) 656 if (!(flg & TCP_FLAG_RST))
643 req->rsk_ops->send_ack(sk, skb, req); 657 req->rsk_ops->send_ack(sk, skb, req);
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
648 662
649 /* In sequence, PAWS is OK. */ 663 /* In sequence, PAWS is OK. */
650 664
651 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) 665 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
652 req->ts_recent = tmp_opt.rcv_tsval; 666 req->ts_recent = tmp_opt.rcv_tsval;
653 667
654 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 668 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
667 681
668 /* ACK sequence verified above, just make sure ACK is 682 /* ACK sequence verified above, just make sure ACK is
669 * set. If ACK not set, just silently drop the packet. 683 * set. If ACK not set, just silently drop the packet.
684 *
685 * XXX (TFO) - if we ever allow "data after SYN", the
686 * following check needs to be removed.
670 */ 687 */
671 if (!(flg & TCP_FLAG_ACK)) 688 if (!(flg & TCP_FLAG_ACK))
672 return NULL; 689 return NULL;
673 690
691 /* For Fast Open no more processing is needed (sk is the
692 * child socket).
693 */
694 if (fastopen)
695 return sk;
696
674 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 697 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
675 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 698 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
676 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 699 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -706,11 +729,21 @@ listen_overflow:
706 } 729 }
707 730
708embryonic_reset: 731embryonic_reset:
709 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 732 if (!(flg & TCP_FLAG_RST)) {
710 if (!(flg & TCP_FLAG_RST)) 733 /* Received a bad SYN pkt - for TFO We try not to reset
734 * the local connection unless it's really necessary to
735 * avoid becoming vulnerable to outside attack aiming at
736 * resetting legit local connections.
737 */
711 req->rsk_ops->send_reset(sk, skb); 738 req->rsk_ops->send_reset(sk, skb);
712 739 } else if (fastopen) { /* received a valid RST pkt */
713 inet_csk_reqsk_queue_drop(sk, req, prev); 740 reqsk_fastopen_remove(sk, req, true);
741 tcp_reset(sk);
742 }
743 if (!fastopen) {
744 inet_csk_reqsk_queue_drop(sk, req, prev);
745 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
746 }
714 return NULL; 747 return NULL;
715} 748}
716EXPORT_SYMBOL(tcp_check_req); 749EXPORT_SYMBOL(tcp_check_req);
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req);
719 * Queue segment on the new socket if the new socket is active, 752 * Queue segment on the new socket if the new socket is active,
720 * otherwise we just shortcircuit this and continue with 753 * otherwise we just shortcircuit this and continue with
721 * the new socket. 754 * the new socket.
755 *
756 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
757 * when entering. But other states are possible due to a race condition
758 * where after __inet_lookup_established() fails but before the listener
759 * locked is obtained, other packets cause the same connection to
760 * be created.
722 */ 761 */
723 762
724int tcp_child_process(struct sock *parent, struct sock *child, 763int tcp_child_process(struct sock *parent, struct sock *child,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d04632673a9e..9383b51f3efc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk,
702 unsigned int mss, struct sk_buff *skb, 702 unsigned int mss, struct sk_buff *skb,
703 struct tcp_out_options *opts, 703 struct tcp_out_options *opts,
704 struct tcp_md5sig_key **md5, 704 struct tcp_md5sig_key **md5,
705 struct tcp_extend_values *xvp) 705 struct tcp_extend_values *xvp,
706 struct tcp_fastopen_cookie *foc)
706{ 707{
707 struct inet_request_sock *ireq = inet_rsk(req); 708 struct inet_request_sock *ireq = inet_rsk(req);
708 unsigned int remaining = MAX_TCP_OPTION_SPACE; 709 unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk,
747 if (unlikely(!ireq->tstamp_ok)) 748 if (unlikely(!ireq->tstamp_ok))
748 remaining -= TCPOLEN_SACKPERM_ALIGNED; 749 remaining -= TCPOLEN_SACKPERM_ALIGNED;
749 } 750 }
750 751 if (foc != NULL) {
752 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
753 need = (need + 3) & ~3U; /* Align to 32 bits */
754 if (remaining >= need) {
755 opts->options |= OPTION_FAST_OPEN_COOKIE;
756 opts->fastopen_cookie = foc;
757 remaining -= need;
758 }
759 }
751 /* Similar rationale to tcp_syn_options() applies here, too. 760 /* Similar rationale to tcp_syn_options() applies here, too.
752 * If the <SYN> options fit, the same options should fit now! 761 * If the <SYN> options fit, the same options should fit now!
753 */ 762 */
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk)
2658 */ 2667 */
2659struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2668struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2660 struct request_sock *req, 2669 struct request_sock *req,
2661 struct request_values *rvp) 2670 struct request_values *rvp,
2671 struct tcp_fastopen_cookie *foc)
2662{ 2672{
2663 struct tcp_out_options opts; 2673 struct tcp_out_options opts;
2664 struct tcp_extend_values *xvp = tcp_xv(rvp); 2674 struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2718#endif 2728#endif
2719 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2729 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2720 tcp_header_size = tcp_synack_options(sk, req, mss, 2730 tcp_header_size = tcp_synack_options(sk, req, mss,
2721 skb, &opts, &md5, xvp) 2731 skb, &opts, &md5, xvp, foc)
2722 + sizeof(*th); 2732 + sizeof(*th);
2723 2733
2724 skb_push(skb, tcp_header_size); 2734 skb_push(skb, tcp_header_size);
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2772 } 2782 }
2773 2783
2774 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2784 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2775 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); 2785 /* XXX data is queued and acked as is. No buffer/window check */
2786 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2776 2787
2777 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2788 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2778 th->window = htons(min(req->rcv_wnd, 65535U)); 2789 th->window = htons(min(req->rcv_wnd, 65535U));
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b774a03bd1dc..fc04711e80c8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -305,6 +305,35 @@ static void tcp_probe_timer(struct sock *sk)
305} 305}
306 306
307/* 307/*
308 * Timer for Fast Open socket to retransmit SYNACK. Note that the
309 * sk here is the child socket, not the parent (listener) socket.
310 */
311static void tcp_fastopen_synack_timer(struct sock *sk)
312{
313 struct inet_connection_sock *icsk = inet_csk(sk);
314 int max_retries = icsk->icsk_syn_retries ? :
315 sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
316 struct request_sock *req;
317
318 req = tcp_sk(sk)->fastopen_rsk;
319 req->rsk_ops->syn_ack_timeout(sk, req);
320
321 if (req->retrans >= max_retries) {
322 tcp_write_err(sk);
323 return;
324 }
325 /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
326 * returned from rtx_syn_ack() to make it more persistent like
327 * regular retransmit because if the child socket has been accepted
328 * it's not good to give up too easily.
329 */
330 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
331 req->retrans++;
332 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
333 TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
334}
335
336/*
308 * The TCP retransmit timer. 337 * The TCP retransmit timer.
309 */ 338 */
310 339
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk)
317 tcp_resume_early_retransmit(sk); 346 tcp_resume_early_retransmit(sk);
318 return; 347 return;
319 } 348 }
320 349 if (tp->fastopen_rsk) {
350 BUG_ON(sk->sk_state != TCP_SYN_RECV &&
351 sk->sk_state != TCP_FIN_WAIT1);
352 tcp_fastopen_synack_timer(sk);
353 /* Before we receive ACK to our SYN-ACK don't retransmit
354 * anything else (e.g., data or FIN segments).
355 */
356 return;
357 }
321 if (!tp->packets_out) 358 if (!tp->packets_out)
322 goto out; 359 goto out;
323 360
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index bb46061c813a..182ab9a85d6c 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -190,6 +190,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
190 ireq = inet_rsk(req); 190 ireq = inet_rsk(req);
191 ireq6 = inet6_rsk(req); 191 ireq6 = inet6_rsk(req);
192 treq = tcp_rsk(req); 192 treq = tcp_rsk(req);
193 treq->listener = NULL;
193 194
194 if (security_inet_conn_request(sk, skb, req)) 195 if (security_inet_conn_request(sk, skb, req))
195 goto out_free; 196 goto out_free;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f99b81d53cca..09078b9bc6f6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -475,7 +475,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
475 if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) 475 if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL)
476 goto done; 476 goto done;
477 477
478 skb = tcp_make_synack(sk, dst, req, rvp); 478 skb = tcp_make_synack(sk, dst, req, rvp, NULL);
479 479
480 if (skb) { 480 if (skb) {
481 __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); 481 __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
@@ -987,7 +987,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
987 &ipv6_hdr(skb)->saddr, 987 &ipv6_hdr(skb)->saddr,
988 &ipv6_hdr(skb)->daddr, inet6_iif(skb)); 988 &ipv6_hdr(skb)->daddr, inet6_iif(skb));
989 if (req) 989 if (req)
990 return tcp_check_req(sk, skb, req, prev); 990 return tcp_check_req(sk, skb, req, prev, false);
991 991
992 nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, 992 nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
993 &ipv6_hdr(skb)->saddr, th->source, 993 &ipv6_hdr(skb)->saddr, th->source,
@@ -1179,6 +1179,7 @@ have_isn:
1179 want_cookie) 1179 want_cookie)
1180 goto drop_and_free; 1180 goto drop_and_free;
1181 1181
1182 tcp_rsk(req)->listener = NULL;
1182 inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); 1183 inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1183 return 0; 1184 return 0;
1184 1185