author	David S. Miller <davem@davemloft.net>	2012-08-31 20:43:37 -0400
committer	David S. Miller <davem@davemloft.net>	2012-08-31 20:43:37 -0400
commit	1bed966cc3bd4042110129f0fc51aeeb59c5b200 (patch)
tree	0d5b9181b840c9b6b08b1452004f0746e8eebab8
parent	2a35cfa591ac63f17815c2d9432b799e37527980 (diff)
parent	168a8f58059a22feb9e9a2dcc1b8053dbbbc12ef (diff)
Merge branch 'tcp_fastopen_server'
Jerry Chu says:

====================
This patch series provides the server (passive open) side code for
TCP Fast Open. Together with the earlier client side patches it
completes the TCP Fast Open implementation.

The server side Fast Open code accepts data carried in the SYN packet
with a valid Fast Open cookie, and passes it to the application right
away, allowing the application to send back response data, all before
TCP's 3-way handshake finishes.

A simple cookie scheme together with capping the number of outstanding
TFO requests (still in TCP_SYN_RECV state) to a limit per listener
forms the main line of defense against spoofed SYN attacks.

For more details about TCP Fast Open see our IETF internet draft at
http://www.ietf.org/id/draft-ietf-tcpm-fastopen-01.txt
and a research paper at
http://conferences.sigcomm.org/co-next/2011/papers/1569470463.pdf

A prototype implementation was first developed by Sivasankar
Radhakrishnan (sivasankar@cs.ucsd.edu). A patch based on an older
version of the Linux kernel has been undergoing internal tests at
Google for the past few months.

Jerry Chu (3):
  tcp: TCP Fast Open Server - header & support functions
  tcp: TCP Fast Open Server - support TFO listeners
  tcp: TCP Fast Open Server - main code path
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
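As background for review, a minimal userspace sketch (not part of this series) of how the two sides are exercised once it is applied. MSG_FASTOPEN comes from the earlier client-side patches and TCP_FASTOPEN is the new listener socket option; both constants are defined as fallbacks in case the installed headers predate the feature.

/* Sketch only, not from this series: exercising TFO from userspace. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN    0x20000000
#endif
#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN    23
#endif

/* Client: sendto() with MSG_FASTOPEN replaces connect(); the kernel
 * puts the payload in the SYN when it holds a valid cookie, and falls
 * back to a regular 3WHS otherwise.
 */
static int tfo_connect(const struct sockaddr_in *srv,
                       const void *buf, size_t len)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        if (sendto(fd, buf, len, MSG_FASTOPEN,
                   (const struct sockaddr *)srv, sizeof(*srv)) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}

/* Server: the TCP_FASTOPEN value caps the number of pending TFO
 * requests (fastopenq->max_qlen); data carried in the SYN is readable
 * on the accepted socket before the handshake completes.
 */
static int tfo_listen(const struct sockaddr_in *addr)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int qlen = 16;

        if (fd < 0)
                return -1;
        bind(fd, (const struct sockaddr *)addr, sizeof(*addr));
        setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
        listen(fd, 128);
        return fd;
}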
-rw-r--r--	Documentation/networking/ip-sysctl.txt	29
-rw-r--r--	include/linux/snmp.h	4
-rw-r--r--	include/linux/tcp.h	45
-rw-r--r--	include/net/request_sock.h	49
-rw-r--r--	include/net/tcp.h	52
-rw-r--r--	net/core/request_sock.c	95
-rw-r--r--	net/ipv4/af_inet.c	28
-rw-r--r--	net/ipv4/inet_connection_sock.c	57
-rw-r--r--	net/ipv4/proc.c	4
-rw-r--r--	net/ipv4/syncookies.c	1
-rw-r--r--	net/ipv4/sysctl_net_ipv4.c	45
-rw-r--r--	net/ipv4/tcp.c	49
-rw-r--r--	net/ipv4/tcp_fastopen.c	83
-rw-r--r--	net/ipv4/tcp_input.c	75
-rw-r--r--	net/ipv4/tcp_ipv4.c	269
-rw-r--r--	net/ipv4/tcp_minisocks.c	61
-rw-r--r--	net/ipv4/tcp_output.c	21
-rw-r--r--	net/ipv4/tcp_timer.c	39
-rw-r--r--	net/ipv6/syncookies.c	1
-rw-r--r--	net/ipv6/tcp_ipv6.c	5
20 files changed, 915 insertions(+), 97 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index d64e53124b8c..c7fc10724948 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -467,16 +467,31 @@ tcp_syncookies - BOOLEAN
 tcp_fastopen - INTEGER
 	Enable TCP Fast Open feature (draft-ietf-tcpm-fastopen) to send data
 	in the opening SYN packet. To use this feature, the client application
-	must not use connect(). Instead, it should use sendmsg() or sendto()
-	with MSG_FASTOPEN flag which performs a TCP handshake automatically.
+	must use sendmsg() or sendto() with MSG_FASTOPEN flag rather than
+	connect() to perform a TCP handshake automatically.
 
-	The values (bitmap) are:
-	1: Enables sending data in the opening SYN on the client
-	5: Enables sending data in the opening SYN on the client regardless
-	of cookie availability.
+	The values (bitmap) are
+	1: Enables sending data in the opening SYN on the client.
+	2: Enables TCP Fast Open on the server side, i.e., allowing data in
+	   a SYN packet to be accepted and passed to the application before
+	   3-way hand shake finishes.
+	4: Send data in the opening SYN regardless of cookie availability and
+	   without a cookie option.
+	0x100: Accept SYN data w/o validating the cookie.
+	0x200: Accept data-in-SYN w/o any cookie option present.
+	0x400/0x800: Enable Fast Open on all listeners regardless of the
+	   TCP_FASTOPEN socket option. The two different flags designate two
+	   different ways of setting max_qlen without the TCP_FASTOPEN socket
+	   option.
 
 	Default: 0
 
+	Note that the client & server side Fast Open flags (1 and 2
+	respectively) must be also enabled before the rest of flags can take
+	effect.
+
+	See include/net/tcp.h and the code for more details.
+
 tcp_syn_retries - INTEGER
 	Number of times initial SYNs for an active TCP connection attempt
 	will be retransmitted. Should not be higher than 255. Default value
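A small sketch (not part of the patch) of composing the bitmap documented above. With TFO_SERVER_WO_SOCKOPT2 (0x800) set, inet_listen() takes max_qlen from the upper 16 bits of the sysctl value, per the af_inet.c hunk below.

#include <stdio.h>

int main(void)
{
        unsigned int val = 0x1 | 0x2 | 0x800;   /* client + server + no-sockopt mode 2 */
        unsigned int max_qlen = 256;            /* consumed via "sysctl_tcp_fastopen >> 16" */
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_fastopen", "w");

        if (!f)
                return 1;
        fprintf(f, "%u\n", val | (max_qlen << 16));
        return fclose(f) ? 1 : 0;
}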
diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index ad6e3a6bf9fb..fdfba235f9f1 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -241,6 +241,10 @@ enum
 	LINUX_MIB_TCPCHALLENGEACK,		/* TCPChallengeACK */
 	LINUX_MIB_TCPSYNCHALLENGE,		/* TCPSYNChallenge */
 	LINUX_MIB_TCPFASTOPENACTIVE,		/* TCPFastOpenActive */
+	LINUX_MIB_TCPFASTOPENPASSIVE,		/* TCPFastOpenPassive*/
+	LINUX_MIB_TCPFASTOPENPASSIVEFAIL,	/* TCPFastOpenPassiveFail */
+	LINUX_MIB_TCPFASTOPENLISTENOVERFLOW,	/* TCPFastOpenListenOverflow */
+	LINUX_MIB_TCPFASTOPENCOOKIEREQD,	/* TCPFastOpenCookieReqd */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index eb125a4c30b3..ae46df590629 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -110,6 +110,7 @@ enum {
 #define TCP_REPAIR_QUEUE	20
 #define TCP_QUEUE_SEQ		21
 #define TCP_REPAIR_OPTIONS	22
+#define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
@@ -246,6 +247,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 /* TCP Fast Open */
 #define TCP_FASTOPEN_COOKIE_MIN	4	/* Min Fast Open Cookie size in bytes */
 #define TCP_FASTOPEN_COOKIE_MAX	16	/* Max Fast Open Cookie size in bytes */
+#define TCP_FASTOPEN_COOKIE_SIZE 8	/* the size employed by this impl. */
 
 /* TCP Fast Open Cookie as stored in memory */
 struct tcp_fastopen_cookie {
@@ -312,9 +314,14 @@ struct tcp_request_sock {
 	/* Only used by TCP MD5 Signature so far. */
 	const struct tcp_request_sock_ops *af_specific;
 #endif
+	struct sock			*listener; /* needed for TFO */
 	u32				rcv_isn;
 	u32				snt_isn;
 	u32				snt_synack; /* synack sent time */
+	u32				rcv_nxt; /* the ack # by SYNACK. For
+						  * FastOpen it's the seq#
+						  * after data-in-SYN.
+						  */
 };
 
 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
@@ -505,14 +512,18 @@ struct tcp_sock {
 	struct tcp_md5sig_info	__rcu *md5sig_info;
 #endif
 
-/* TCP fastopen related information */
-	struct tcp_fastopen_request *fastopen_req;
-
 	/* When the cookie options are generated and exchanged, then this
 	 * object holds a reference to them (cookie_values->kref).  Also
 	 * contains related tcp_cookie_transactions fields.
 	 */
 	struct tcp_cookie_values  *cookie_values;
+
+/* TCP fastopen related information */
+	struct tcp_fastopen_request *fastopen_req;
+	/* fastopen_rsk points to request_sock that resulted in this big
+	 * socket. Used to retransmit SYNACKs etc.
+	 */
+	struct request_sock *fastopen_rsk;
 };
 
 enum tsq_flags {
@@ -552,6 +563,34 @@ static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
 	return (struct tcp_timewait_sock *)sk;
 }
 
+static inline bool tcp_passive_fastopen(const struct sock *sk)
+{
+	return (sk->sk_state == TCP_SYN_RECV &&
+		tcp_sk(sk)->fastopen_rsk != NULL);
+}
+
+static inline bool fastopen_cookie_present(struct tcp_fastopen_cookie *foc)
+{
+	return foc->len != -1;
+}
+
+static inline int fastopen_init_queue(struct sock *sk, int backlog)
+{
+	struct request_sock_queue *queue =
+	    &inet_csk(sk)->icsk_accept_queue;
+
+	if (queue->fastopenq == NULL) {
+		queue->fastopenq = kzalloc(
+		    sizeof(struct fastopen_queue),
+		    sk->sk_allocation);
+		if (queue->fastopenq == NULL)
+			return -ENOMEM;
+		spin_lock_init(&queue->fastopenq->lock);
+	}
+	queue->fastopenq->max_qlen = backlog;
+	return 0;
+}
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _LINUX_TCP_H */
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 4c0766e201e3..b01d8dd9ee7c 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -106,6 +106,34 @@ struct listen_sock {
 	struct request_sock	*syn_table[0];
 };
 
+/*
+ * For a TCP Fast Open listener -
+ *	lock - protects the access to all the reqsk, which is co-owned by
+ *		the listener and the child socket.
+ *	qlen - pending TFO requests (still in TCP_SYN_RECV).
+ *	max_qlen - max TFO reqs allowed before TFO is disabled.
+ *
+ *	XXX (TFO) - ideally these fields can be made as part of "listen_sock"
+ *	structure above. But there is some implementation difficulty due to
+ *	listen_sock being part of request_sock_queue hence will be freed when
+ *	a listener is stopped. But TFO related fields may continue to be
+ *	accessed even after a listener is closed, until its sk_refcnt drops
+ *	to 0 implying no more outstanding TFO reqs. One solution is to keep
+ *	listen_opt around until sk_refcnt drops to 0. But there is some other
+ *	complexity that needs to be resolved. E.g., a listener can be disabled
+ *	temporarily through shutdown()->tcp_disconnect(), and re-enabled later.
+ */
+struct fastopen_queue {
+	struct request_sock	*rskq_rst_head; /* Keep track of past TFO */
+	struct request_sock	*rskq_rst_tail; /* requests that caused RST.
+						 * This is part of the defense
+						 * against spoofing attack.
+						 */
+	spinlock_t	lock;
+	int		qlen;		/* # of pending (TCP_SYN_RECV) reqs */
+	int		max_qlen;	/* != 0 iff TFO is currently enabled */
+};
+
 /** struct request_sock_queue - queue of request_socks
  *
  * @rskq_accept_head - FIFO head of established children
@@ -129,6 +157,12 @@ struct request_sock_queue {
 	u8			rskq_defer_accept;
 	/* 3 bytes hole, try to pack */
 	struct listen_sock	*listen_opt;
+	struct fastopen_queue	*fastopenq; /* This is non-NULL iff TFO has been
+					     * enabled on this listener. Check
+					     * max_qlen != 0 in fastopen_queue
+					     * to determine if TFO is enabled
+					     * right at this moment.
+					     */
 };
 
 extern int reqsk_queue_alloc(struct request_sock_queue *queue,
@@ -136,6 +170,8 @@ extern int reqsk_queue_alloc(struct request_sock_queue *queue,
 
 extern void __reqsk_queue_destroy(struct request_sock_queue *queue);
 extern void reqsk_queue_destroy(struct request_sock_queue *queue);
+extern void reqsk_fastopen_remove(struct sock *sk,
+				  struct request_sock *req, bool reset);
 
 static inline struct request_sock *
 	reqsk_queue_yank_acceptq(struct request_sock_queue *queue)
@@ -190,19 +226,6 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue
 	return req;
 }
 
-static inline struct sock *reqsk_queue_get_child(struct request_sock_queue *queue,
-						 struct sock *parent)
-{
-	struct request_sock *req = reqsk_queue_remove(queue);
-	struct sock *child = req->sk;
-
-	WARN_ON(child == NULL);
-
-	sk_acceptq_removed(parent);
-	__reqsk_free(req);
-	return child;
-}
-
 static inline int reqsk_queue_removed(struct request_sock_queue *queue,
 				      struct request_sock *req)
 {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0fca06f16463..1421b02a7905 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -224,8 +224,24 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 /* Bit Flags for sysctl_tcp_fastopen */
 #define	TFO_CLIENT_ENABLE	1
+#define	TFO_SERVER_ENABLE	2
 #define	TFO_CLIENT_NO_COOKIE	4	/* Data in SYN w/o cookie option */
 
+/* Process SYN data but skip cookie validation */
+#define	TFO_SERVER_COOKIE_NOT_CHKED	0x100
+/* Accept SYN data w/o any cookie option */
+#define	TFO_SERVER_COOKIE_NOT_REQD	0x200
+
+/* Force enable TFO on all listeners, i.e., not requiring the
+ * TCP_FASTOPEN socket option. SOCKOPT1/2 determine how to set max_qlen.
+ */
+#define	TFO_SERVER_WO_SOCKOPT1	0x400
+#define	TFO_SERVER_WO_SOCKOPT2	0x800
+/* Always create TFO child sockets on a TFO listener even when
+ * cookie/data not present. (For testing purpose!)
+ */
+#define	TFO_SERVER_ALWAYS	0x1000
+
 extern struct inet_timewait_death_row tcp_death_row;
 
 /* sysctl variables for tcp */
@@ -408,7 +424,8 @@ extern enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *
 							 const struct tcphdr *th);
 extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb,
 				   struct request_sock *req,
-				   struct request_sock **prev);
+				   struct request_sock **prev,
+				   bool fastopen);
 extern int tcp_child_process(struct sock *parent, struct sock *child,
 			     struct sk_buff *skb);
 extern bool tcp_use_frto(struct sock *sk);
@@ -421,12 +438,6 @@ extern void tcp_metrics_init(void);
 extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check);
 extern bool tcp_remember_stamp(struct sock *sk);
 extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
-extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
-				   struct tcp_fastopen_cookie *cookie,
-				   int *syn_loss, unsigned long *last_syn_loss);
-extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
-				   struct tcp_fastopen_cookie *cookie,
-				   bool syn_lost);
 extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
 extern void tcp_disable_fack(struct tcp_sock *tp);
 extern void tcp_close(struct sock *sk, long timeout);
@@ -468,7 +479,8 @@ extern int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
 extern int tcp_connect(struct sock *sk);
 extern struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 					struct request_sock *req,
-					struct request_values *rvp);
+					struct request_values *rvp,
+					struct tcp_fastopen_cookie *foc);
 extern int tcp_disconnect(struct sock *sk, int flags);
 
 void tcp_connect_init(struct sock *sk);
@@ -537,6 +549,7 @@ extern void tcp_send_delayed_ack(struct sock *sk);
 extern void tcp_cwnd_application_limited(struct sock *sk);
 extern void tcp_resume_early_retransmit(struct sock *sk);
 extern void tcp_rearm_rto(struct sock *sk);
+extern void tcp_reset(struct sock *sk);
 
 /* tcp_timer.c */
 extern void tcp_init_xmit_timers(struct sock *);
@@ -586,6 +599,7 @@ extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
 extern int tcp_mss_to_mtu(struct sock *sk, int mss);
 extern void tcp_mtup_init(struct sock *sk);
 extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
+extern void tcp_init_buffer_space(struct sock *sk);
 
 static inline void tcp_bound_rto(const struct sock *sk)
 {
@@ -1104,6 +1118,7 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	req->rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
 	req->cookie_ts = 0;
 	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
+	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
 	req->mss = rx_opt->mss_clamp;
 	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
 	ireq->tstamp_ok = rx_opt->tstamp_ok;
@@ -1308,15 +1323,34 @@ extern int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff
 extern int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
 			    const struct tcp_md5sig_key *key);
 
+/* From tcp_fastopen.c */
+extern void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+				   struct tcp_fastopen_cookie *cookie,
+				   int *syn_loss, unsigned long *last_syn_loss);
+extern void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+				   struct tcp_fastopen_cookie *cookie,
+				   bool syn_lost);
 struct tcp_fastopen_request {
 	/* Fast Open cookie. Size 0 means a cookie request */
 	struct tcp_fastopen_cookie	cookie;
 	struct msghdr			*data;  /* data in MSG_FASTOPEN */
 	u16				copied;	/* queued in tcp_connect() */
 };
-
 void tcp_free_fastopen_req(struct tcp_sock *tp);
 
+extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
+int tcp_fastopen_reset_cipher(void *key, unsigned int len);
+void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc);
+
+#define TCP_FASTOPEN_KEY_LENGTH 16
+
+/* Fastopen key context */
+struct tcp_fastopen_context {
+	struct crypto_cipher __rcu	*tfm;
+	__u8				key[TCP_FASTOPEN_KEY_LENGTH];
+	struct rcu_head			rcu;
+};
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 9b570a6a33c5..c31d9e8668c3 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -15,6 +15,7 @@
 #include <linux/random.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/tcp.h>
 #include <linux/vmalloc.h>
 
 #include <net/request_sock.h>
@@ -130,3 +131,97 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
 	kfree(lopt);
 }
 
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * Only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock. A corner
+ * case might also exist in tcp_v4_hnd_req() that will trigger this locking
+ * order.
+ *
+ * When a TFO req is created, it needs to sock_hold its listener to prevent
+ * the latter data structure from going away.
+ *
+ * This function also sets "treq->listener" to NULL and unreference listener
+ * socket. treq->listener is used by the listener so it is protected by the
+ * fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+			   bool reset)
+{
+	struct sock *lsk = tcp_rsk(req)->listener;
+	struct fastopen_queue *fastopenq =
+	    inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+	BUG_ON(!spin_is_locked(&sk->sk_lock.slock) && !sock_owned_by_user(sk));
+
+	tcp_sk(sk)->fastopen_rsk = NULL;
+	spin_lock_bh(&fastopenq->lock);
+	fastopenq->qlen--;
+	tcp_rsk(req)->listener = NULL;
+	if (req->sk)	/* the child socket hasn't been accepted yet */
+		goto out;
+
+	if (!reset || lsk->sk_state != TCP_LISTEN) {
+		/* If the listener has been closed don't bother with the
+		 * special RST handling below.
+		 */
+		spin_unlock_bh(&fastopenq->lock);
+		sock_put(lsk);
+		reqsk_free(req);
+		return;
+	}
+	/* Wait for 60secs before removing a req that has triggered RST.
+	 * This is a simple defense against TFO spoofing attack - by
+	 * counting the req against fastopen.max_qlen, and disabling
+	 * TFO when the qlen exceeds max_qlen.
+	 *
+	 * For more details see CoNext'11 "TCP Fast Open" paper.
+	 */
+	req->expires = jiffies + 60*HZ;
+	if (fastopenq->rskq_rst_head == NULL)
+		fastopenq->rskq_rst_head = req;
+	else
+		fastopenq->rskq_rst_tail->dl_next = req;
+
+	req->dl_next = NULL;
+	fastopenq->rskq_rst_tail = req;
+	fastopenq->qlen++;
+out:
+	spin_unlock_bh(&fastopenq->lock);
+	sock_put(lsk);
+	return;
+}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6681ccf5c3ee..4f70ef0b946d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk)
 		pr_err("Attempt to release alive inet socket %p\n", sk);
 		return;
 	}
+	if (sk->sk_type == SOCK_STREAM) {
+		struct fastopen_queue *fastopenq =
+			inet_csk(sk)->icsk_accept_queue.fastopenq;
+		kfree(fastopenq);
+	}
 
 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
@@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
+		/* Check special setups for testing purpose to enable TFO w/o
+		 * requiring TCP_FASTOPEN sockopt.
+		 * Note that only TCP sockets (SOCK_STREAM) will reach here.
+		 * Also fastopenq may already been allocated because this
+		 * socket was in TCP_LISTEN state previously but was
+		 * shutdown() (rather than close()).
+		 */
+		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+		    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+				err = fastopen_init_queue(sk, backlog);
+			else if ((sysctl_tcp_fastopen &
+				  TFO_SERVER_WO_SOCKOPT2) != 0)
+				err = fastopen_init_queue(sk,
+				    ((uint)sysctl_tcp_fastopen) >> 16);
+			else
+				err = 0;
+			if (err)
+				goto out;
+		}
 		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
@@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	sock_rps_record_flow(sk2);
 	WARN_ON(!((1 << sk2->sk_state) &
-		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
 	sock_graft(sk2, newsock);
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7f75f21d7b83..8464b79c493f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct sock *newsk;
+	struct request_sock *req;
 	int error;
 
 	lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		goto out_err;
 
 	/* Find already established connection */
-	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+	if (reqsk_queue_empty(queue)) {
 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
 		/* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		if (error)
 			goto out_err;
 	}
-
-	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-	WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+	req = reqsk_queue_remove(queue);
+	newsk = req->sk;
+
+	sk_acceptq_removed(sk);
+	if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) {
+		spin_lock_bh(&queue->fastopenq->lock);
+		if (tcp_rsk(req)->listener) {
+			/* We are still waiting for the final ACK from 3WHS
+			 * so can't free req now. Instead, we set req->sk to
+			 * NULL to signify that the child socket is taken
+			 * so reqsk_fastopen_remove() will free the req
+			 * when 3WHS finishes (or is aborted).
+			 */
+			req->sk = NULL;
+			req = NULL;
+		}
+		spin_unlock_bh(&queue->fastopenq->lock);
+	}
 out:
 	release_sock(sk);
+	if (req)
+		__reqsk_free(req);
 	return newsk;
 out_err:
 	newsk = NULL;
+	req = NULL;
 	*err = error;
 	goto out;
 }
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 void inet_csk_listen_stop(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct request_sock *acc_req;
 	struct request_sock *req;
 
 	inet_csk_delete_keepalive_timer(sk);
 
 	/* make all the listen_opt local to us */
-	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+	acc_req = reqsk_queue_yank_acceptq(queue);
 
 	/* Following specs, it would be better either to send FIN
 	 * (and enter FIN-WAIT-1, it is normal close)
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * To be honest, we are not able to make either
 	 * of the variants now.			--ANK
 	 */
-	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	reqsk_queue_destroy(queue);
 
 	while ((req = acc_req) != NULL) {
 		struct sock *child = req->sk;
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		percpu_counter_inc(sk->sk_prot->orphan_count);
 
+		if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) {
+			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+			BUG_ON(sk != tcp_rsk(req)->listener);
+
+			/* Paranoid, to prevent race condition if
+			 * an inbound pkt destined for child is
+			 * blocked by sock lock in tcp_v4_rcv().
+			 * Also to satisfy an assertion in
+			 * tcp_v4_destroy_sock().
+			 */
+			tcp_sk(child)->fastopen_rsk = NULL;
+			sock_put(sk);
+		}
 		inet_csk_destroy_sock(child);
 
 		bh_unlock_sock(child);
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
 		sk_acceptq_removed(sk);
 		__reqsk_free(req);
 	}
+	if (queue->fastopenq != NULL) {
+		/* Free all the reqs queued in rskq_rst_head. */
+		spin_lock_bh(&queue->fastopenq->lock);
+		acc_req = queue->fastopenq->rskq_rst_head;
+		queue->fastopenq->rskq_rst_head = NULL;
+		spin_unlock_bh(&queue->fastopenq->lock);
+		while ((req = acc_req) != NULL) {
+			acc_req = req->dl_next;
+			__reqsk_free(req);
+		}
+	}
 	WARN_ON(sk->sk_ack_backlog);
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 957acd12250b..8de53e1ddd54 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -263,6 +263,10 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
 	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650e1528e1e6..ba48e799b031 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->tstamp_ok	= tcp_opt.saw_tstamp;
 	req->ts_recent	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+	treq->listener		= NULL;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3e78c79b5586..9205e492dc9d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -232,6 +232,45 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
 	return 0;
 }
 
+int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
+			  size_t *lenp, loff_t *ppos)
+{
+	ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
+	struct tcp_fastopen_context *ctxt;
+	int ret;
+	u32  user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
+	if (!tbl.data)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	ctxt = rcu_dereference(tcp_fastopen_ctx);
+	if (ctxt)
+		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+	rcu_read_unlock();
+
+	snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
+		user_key[0], user_key[1], user_key[2], user_key[3]);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
+			   user_key + 2, user_key + 3) != 4) {
+			ret = -EINVAL;
+			goto bad_key;
+		}
+		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+	}
+
+bad_key:
+	pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+	       user_key[0], user_key[1], user_key[2], user_key[3],
+	       (char *)tbl.data, ret);
+	kfree(tbl.data);
+	return ret;
+}
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname = "tcp_timestamps",
@@ -386,6 +425,12 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "tcp_fastopen_key",
+		.mode		= 0600,
+		.maxlen		= ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+		.proc_handler	= proc_tcp_fastopen_key,
+	},
+	{
 		.procname	= "tcp_tw_recycle",
 		.data		= &tcp_death_row.sysctl_tw_recycle,
 		.maxlen		= sizeof(int),
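The tcp_fastopen_key procfs entry above speaks four dash-separated 32-bit hex words. A small userspace sketch (not part of the patch) mirroring the snprintf()/sscanf() round trip used by proc_tcp_fastopen_key():

#include <stdio.h>

int main(void)
{
        unsigned int key[4] = { 0xdeadbeef, 0x01234567, 0x89abcdef, 0x42424242 };
        unsigned int back[4];
        char buf[16 * 2 + 10];  /* TCP_FASTOPEN_KEY_LENGTH * 2 + 10, as above */

        snprintf(buf, sizeof(buf), "%08x-%08x-%08x-%08x",
                 key[0], key[1], key[2], key[3]);
        if (sscanf(buf, "%x-%x-%x-%x",
                   &back[0], &back[1], &back[2], &back[3]) != 4)
                return 1;
        printf("%s\n", buf);    /* e.g. deadbeef-01234567-89abcdef-42424242 */
        return 0;
}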
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2109ff4a1daf..df83d744e380 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 
-	/* Connected? */
-	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+	/* Connected or passive Fast Open socket? */
+	if (sk->sk_state != TCP_SYN_SENT &&
+	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
 		int target = sock_rcvlowat(sk, 0, INT_MAX);
 
 		if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto out_err;
+	}
 
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto do_error;
+	}
 
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout)
 		 * they look as CLOSING or LAST_ACK for Linux)
 		 * Probably, I missed some more holelets.
 		 * 						--ANK
+		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+		 * in a single packet! (May consider it later but will
+		 * probably need API support or TCP_CORK SYN-ACK until
+		 * data is written and socket is closed.)
 		 */
 		tcp_send_fin(sk);
 	}
@@ -2215,8 +2230,16 @@ adjudge_to_death:
 		}
 	}
 
-	if (sk->sk_state == TCP_CLOSE)
+	if (sk->sk_state == TCP_CLOSE) {
+		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+		/* We could get here with a non-NULL req if the socket is
+		 * aborted (e.g., closed with unread data) before 3WHS
+		 * finishes.
+		 */
+		if (req != NULL)
+			reqsk_fastopen_remove(sk, req, false);
 		inet_csk_destroy_sock(sk);
+	}
 	/* Otherwise, socket is reprieved until protocol close. */
 
 out:
@@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			icsk->icsk_user_timeout = msecs_to_jiffies(val);
 		break;
+
+	case TCP_FASTOPEN:
+		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+		    TCPF_LISTEN)))
+			err = fastopen_init_queue(sk, val);
+		else
+			err = -EINVAL;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
 
 void tcp_done(struct sock *sk)
 {
+	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
 	tcp_set_state(sk, TCP_CLOSE);
 	tcp_clear_xmit_timers(sk);
+	if (req != NULL)
+		reqsk_fastopen_remove(sk, req, false);
 
 	sk->sk_shutdown = SHUTDOWN_MASK;
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a7f729c409d7..8f7ef0ad80e5 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,10 +1,91 @@
+#include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/tcp.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <net/inetpeer.h>
+#include <net/tcp.h>
 
-int sysctl_tcp_fastopen;
+int sysctl_tcp_fastopen __read_mostly;
+
+struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
+
+static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
+
+static void tcp_fastopen_ctx_free(struct rcu_head *head)
+{
+	struct tcp_fastopen_context *ctx =
+	    container_of(head, struct tcp_fastopen_context, rcu);
+	crypto_free_cipher(ctx->tfm);
+	kfree(ctx);
+}
+
+int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+{
+	int err;
+	struct tcp_fastopen_context *ctx, *octx;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
+
+	if (IS_ERR(ctx->tfm)) {
+		err = PTR_ERR(ctx->tfm);
+error:		kfree(ctx);
+		pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
+		return err;
+	}
+	err = crypto_cipher_setkey(ctx->tfm, key, len);
+	if (err) {
+		pr_err("TCP: TFO cipher key error: %d\n", err);
+		crypto_free_cipher(ctx->tfm);
+		goto error;
+	}
+	memcpy(ctx->key, key, len);
+
+	spin_lock(&tcp_fastopen_ctx_lock);
+
+	octx = rcu_dereference_protected(tcp_fastopen_ctx,
+				lockdep_is_held(&tcp_fastopen_ctx_lock));
+	rcu_assign_pointer(tcp_fastopen_ctx, ctx);
+	spin_unlock(&tcp_fastopen_ctx_lock);
+
+	if (octx)
+		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+	return err;
+}
+
+/* Computes the fastopen cookie for the peer.
+ * The peer address is a 128 bits long (pad with zeros for IPv4).
+ *
+ * The caller must check foc->len to determine if a valid cookie
+ * has been generated successfully.
+*/
+void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
+{
+	__be32 peer_addr[4] = { addr, 0, 0, 0 };
+	struct tcp_fastopen_context *ctx;
+
+	rcu_read_lock();
+	ctx = rcu_dereference(tcp_fastopen_ctx);
+	if (ctx) {
+		crypto_cipher_encrypt_one(ctx->tfm,
+					  foc->val,
+					  (__u8 *)peer_addr);
+		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+	}
+	rcu_read_unlock();
+}
 
 static int __init tcp_fastopen_init(void)
 {
+	__u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+	get_random_bytes(key, sizeof(key));
+	tcp_fastopen_reset_cipher(key, sizeof(key));
 	return 0;
 }
 
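A userspace illustration (not part of the patch; assumes OpenSSL's legacy AES_* API, link with -lcrypto) of the cookie scheme in tcp_fastopen_cookie_gen() above: one AES block encryption of the peer address zero-padded to 128 bits, with the first TCP_FASTOPEN_COOKIE_SIZE (8) bytes used as the cookie.

#include <openssl/aes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        uint8_t key[16];                /* stands in for ctx->key; kernel uses random bytes */
        uint8_t block[16] = { 0 }, out[16];
        uint32_t peer = 0x0100007f;     /* 127.0.0.1 as __be32 on a little-endian host */
        AES_KEY aes;

        memset(key, 0xa5, sizeof(key));         /* fixed key, for illustration only */
        memcpy(block, &peer, sizeof(peer));     /* pad IPv4 address to one AES block */
        AES_set_encrypt_key(key, 128, &aes);
        AES_encrypt(block, out, &aes);

        printf("cookie: ");
        for (int i = 0; i < 8; i++)     /* leading 8 bytes form the cookie */
                printf("%02x", out[i]);
        printf("\n");
        return 0;
}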
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ce4ffe9ed556..8c304a400798 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -378,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 /* 4. Try to fixup all. It is made immediately after connection enters
  *    established state.
  */
-static void tcp_init_buffer_space(struct sock *sk)
+void tcp_init_buffer_space(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int maxwin;
@@ -3127,6 +3127,12 @@ void tcp_rearm_rto(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	/* If the retrans timer is currently being used by Fast Open
+	 * for SYN-ACK retrans purpose, stay put.
+	 */
+	if (tp->fastopen_rsk)
+		return;
+
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
@@ -4038,7 +4044,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
 }
 
 /* When we get a reset we do this. */
-static void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk)
 {
 	/* We want the right error as BSD sees it (and indeed as we do). */
 	switch (sk->sk_state) {
@@ -5895,7 +5901,9 @@ discard:
 		tcp_send_synack(sk);
 #if 0
 		/* Note, we could accept data and URG from this segment.
-		 * There are no obstacles to make this.
+		 * There are no obstacles to make this (except that we must
+		 * either change tcp_recvmsg() to prevent it from returning data
+		 * before 3WHS completes per RFC793, or employ TCP Fast Open).
 		 *
 		 * However, if we ignore data in ACKless segments sometimes,
 		 * we have no reasons to accept it sometimes.
@@ -5935,6 +5943,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *req;
 	int queued = 0;
 
 	tp->rx_opt.saw_tstamp = 0;
@@ -5990,7 +5999,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		return 0;
 	}
 
-	if (!tcp_validate_incoming(sk, skb, th, 0))
+	req = tp->fastopen_rsk;
+	if (req != NULL) {
+		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		    sk->sk_state != TCP_FIN_WAIT1);
+
+		if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+			goto discard;
+	} else if (!tcp_validate_incoming(sk, skb, th, 0))
 		return 0;
 
 	/* step 5: check the ACK field */
@@ -6000,7 +6016,22 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	switch (sk->sk_state) {
 	case TCP_SYN_RECV:
 		if (acceptable) {
-			tp->copied_seq = tp->rcv_nxt;
+			/* Once we leave TCP_SYN_RECV, we no longer
+			 * need req so release it.
+			 */
+			if (req) {
+				reqsk_fastopen_remove(sk, req, false);
+			} else {
+				/* Make sure socket is routed, for
+				 * correct metrics.
+				 */
+				icsk->icsk_af_ops->rebuild_header(sk);
+				tcp_init_congestion_control(sk);
+
+				tcp_mtup_init(sk);
+				tcp_init_buffer_space(sk);
+				tp->copied_seq = tp->rcv_nxt;
+			}
 			smp_mb();
 			tcp_set_state(sk, TCP_ESTABLISHED);
 			sk->sk_state_change(sk);
@@ -6022,23 +6053,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if (tp->rx_opt.tstamp_ok)
 				tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
 
-			/* Make sure socket is routed, for
-			 * correct metrics.
-			 */
-			icsk->icsk_af_ops->rebuild_header(sk);
-
-			tcp_init_metrics(sk);
-
-			tcp_init_congestion_control(sk);
+			if (req) {
+				/* Re-arm the timer because data may
+				 * have been sent out. This is similar
+				 * to the regular data transmission case
+				 * when new data has just been ack'ed.
+				 *
+				 * (TFO) - we could try to be more
+				 * aggressive and retranmitting any data
+				 * sooner based on when they were sent
+				 * out.
+				 */
+				tcp_rearm_rto(sk);
+			} else
+				tcp_init_metrics(sk);
 
 			/* Prevent spurious tcp_cwnd_restart() on
 			 * first data packet.
 			 */
 			tp->lsndtime = tcp_time_stamp;
 
-			tcp_mtup_init(sk);
 			tcp_initialize_rcv_mss(sk);
-			tcp_init_buffer_space(sk);
 			tcp_fast_path_on(tp);
 		} else {
 			return 1;
@@ -6046,6 +6081,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		break;
 
 	case TCP_FIN_WAIT1:
+		/* If we enter the TCP_FIN_WAIT1 state and we are a
+		 * Fast Open socket and this is the first acceptable
+		 * ACK we have received, this would have acknowledged
+		 * our SYNACK so stop the SYNACK timer.
+		 */
+		if (acceptable && req != NULL) {
+			/* We no longer need the request sock. */
+			reqsk_fastopen_remove(sk, req, false);
+			tcp_rearm_rto(sk);
+		}
 		if (tp->snd_una == tp->write_seq) {
 			struct dst_entry *dst;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 36f02f954ac1..e64abed249cc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 	const int code = icmp_hdr(icmp_skb)->code;
 	struct sock *sk;
 	struct sk_buff *skb;
+	struct request_sock *req;
 	__u32 seq;
 	__u32 remaining;
 	int err;
@@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
 	icsk = inet_csk(sk);
 	tp = tcp_sk(sk);
+	req = tp->fastopen_rsk;
 	seq = ntohl(th->seq);
 	if (sk->sk_state != TCP_LISTEN &&
-	    !between(seq, tp->snd_una, tp->snd_nxt)) {
+	    !between(seq, tp->snd_una, tp->snd_nxt) &&
+	    (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
+		/* For a Fast Open socket, allow seq to be snt_isn. */
 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 		goto out;
 	}
@@ -435,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		    !icsk->icsk_backoff)
 			break;
 
+		/* XXX (TFO) - revisit the following logic for TFO */
+
 		if (sock_owned_by_user(sk))
 			break;
 
@@ -466,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		goto out;
 	}
 
+	/* XXX (TFO) - if it's a TFO socket and has been accepted, rather
+	 * than following the TCP_SYN_RECV case and closing the socket,
+	 * we ignore the ICMP error and keep trying like a fully established
+	 * socket. Is this the right thing to do?
+	 */
+	if (req && req->sk == NULL)
+		goto out;
+
 	switch (sk->sk_state) {
 		struct request_sock *req, **prev;
 	case TCP_LISTEN:
@@ -498,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 
 	case TCP_SYN_SENT:
 	case TCP_SYN_RECV:  /* Cannot happen.
-			       It can f.e. if SYNs crossed.
+			       It can f.e. if SYNs crossed,
+			       or Fast Open.
 			     */
 		if (!sock_owned_by_user(sk)) {
 			sk->sk_err = err;
@@ -809,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
-	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
-			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
+	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
+	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
+	 */
+	tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
+			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+			tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
 			req->ts_recent,
 			0,
 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -839,7 +858,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 		return -1;
 
-	skb = tcp_make_synack(sk, dst, req, rvp);
+	skb = tcp_make_synack(sk, dst, req, rvp, NULL);
 
 	if (skb) {
 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -1272,6 +1291,178 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1272}; 1291};
1273#endif 1292#endif
1274 1293
1294static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1295 struct request_sock *req,
1296 struct tcp_fastopen_cookie *foc,
1297 struct tcp_fastopen_cookie *valid_foc)
1298{
1299 bool skip_cookie = false;
1300 struct fastopen_queue *fastopenq;
1301
1302 if (likely(!fastopen_cookie_present(foc))) {
1303 /* See include/net/tcp.h for the meaning of these knobs */
1304 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1305 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1306 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1307 skip_cookie = true; /* no cookie to validate */
1308 else
1309 return false;
1310 }
1311 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1312 /* A FO option is present; bump the counter. */
1313 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1314
1315 /* Make sure the listener has enabled fastopen, and we don't
1316 * exceed the max # of pending TFO requests allowed before trying
 1317 * to validate the cookie, in order to avoid burning CPU cycles
1318 * unnecessarily.
1319 *
1320 * XXX (TFO) - The implication of checking the max_qlen before
1321 * processing a cookie request is that clients can't differentiate
1322 * between qlen overflow causing Fast Open to be disabled
 1323 * temporarily and a server not supporting Fast Open at all.
1324 */
1325 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1326 fastopenq == NULL || fastopenq->max_qlen == 0)
1327 return false;
1328
1329 if (fastopenq->qlen >= fastopenq->max_qlen) {
1330 struct request_sock *req1;
1331 spin_lock(&fastopenq->lock);
1332 req1 = fastopenq->rskq_rst_head;
1333 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1334 spin_unlock(&fastopenq->lock);
1335 NET_INC_STATS_BH(sock_net(sk),
1336 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1337 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1338 foc->len = -1;
1339 return false;
1340 }
1341 fastopenq->rskq_rst_head = req1->dl_next;
1342 fastopenq->qlen--;
1343 spin_unlock(&fastopenq->lock);
1344 reqsk_free(req1);
1345 }
1346 if (skip_cookie) {
1347 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1348 return true;
1349 }
1350 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1351 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1352 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1353 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1354 memcmp(&foc->val[0], &valid_foc->val[0],
1355 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1356 return false;
1357 valid_foc->len = -1;
1358 }
1359 /* Acknowledge the data received from the peer. */
1360 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1361 return true;
1362 } else if (foc->len == 0) { /* Client requesting a cookie */
1363 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1364 NET_INC_STATS_BH(sock_net(sk),
1365 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1366 } else {
 1367 /* Client sent a cookie of the wrong size. Treat it
1368 * the same as invalid and return a valid one.
1369 */
1370 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1371 }
1372 return false;
1373}
1374
1375static int tcp_v4_conn_req_fastopen(struct sock *sk,
1376 struct sk_buff *skb,
1377 struct sk_buff *skb_synack,
1378 struct request_sock *req,
1379 struct request_values *rvp)
1380{
1381 struct tcp_sock *tp = tcp_sk(sk);
1382 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1383 const struct inet_request_sock *ireq = inet_rsk(req);
1384 struct sock *child;
1385
1386 req->retrans = 0;
1387 req->sk = NULL;
1388
1389 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1390 if (child == NULL) {
1391 NET_INC_STATS_BH(sock_net(sk),
1392 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1393 kfree_skb(skb_synack);
1394 return -1;
1395 }
1396 ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1397 ireq->rmt_addr, ireq->opt);
 1398 /* XXX (TFO) - is it OK to ignore the error and continue? */
1399
1400 spin_lock(&queue->fastopenq->lock);
1401 queue->fastopenq->qlen++;
1402 spin_unlock(&queue->fastopenq->lock);
1403
1404 /* Initialize the child socket. Have to fix some values to take
 1405 * into account that the child is a Fast Open socket and is created
1406 * only out of the bits carried in the SYN packet.
1407 */
1408 tp = tcp_sk(child);
1409
1410 tp->fastopen_rsk = req;
 1411 /* Take a hold on the listener sk so that if the listener is being
1412 * closed, the child that has been accepted can live on and still
1413 * access listen_lock.
1414 */
1415 sock_hold(sk);
1416 tcp_rsk(req)->listener = sk;
1417
1418 /* RFC1323: The window in SYN & SYN/ACK segments is never
1419 * scaled. So correct it appropriately.
1420 */
1421 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1422
1423 /* Activate the retrans timer so that SYNACK can be retransmitted.
1424 * The request socket is not added to the SYN table of the parent
1425 * because it's been added to the accept queue directly.
1426 */
1427 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1428 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1429
1430 /* Add the child socket directly into the accept queue */
1431 inet_csk_reqsk_queue_add(sk, req, child);
1432
1433 /* Now finish processing the fastopen child socket. */
1434 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1435 tcp_init_congestion_control(child);
1436 tcp_mtup_init(child);
1437 tcp_init_buffer_space(child);
1438 tcp_init_metrics(child);
1439
1440 /* Queue the data carried in the SYN packet. We need to first
1441 * bump skb's refcnt because the caller will attempt to free it.
1442 *
1443 * XXX (TFO) - we honor a zero-payload TFO request for now.
1444 * (Any reason not to?)
1445 */
1446 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1447 /* Don't queue the skb if there is no payload in SYN.
1448 * XXX (TFO) - How about SYN+FIN?
1449 */
1450 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1451 } else {
1452 skb = skb_get(skb);
1453 skb_dst_drop(skb);
1454 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1455 skb_set_owner_r(skb, child);
1456 __skb_queue_tail(&child->sk_receive_queue, skb);
1457 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1458 }
1459 sk->sk_data_ready(sk, 0);
1460 bh_unlock_sock(child);
1461 sock_put(child);
1462 WARN_ON(req->sk == NULL);
1463 return 0;
1464}
1465
1275int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1466int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1276{ 1467{
1277 struct tcp_extend_values tmp_ext; 1468 struct tcp_extend_values tmp_ext;
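The check in tcp_fastopen_check() above boils down to regenerating the expected cookie from the client's source address and comparing it byte-for-byte against the one carried in the SYN; tcp_fastopen_cookie_gen() (net/ipv4/tcp_fastopen.c in this series) derives it from a per-host secret. Below is a toy userspace sketch of that generate-and-compare flow; toy_cookie_gen() and COOKIE_SIZE are inventions of the sketch, and the FNV-style hash is deliberately not cryptographically sound, it only stands in for the kernel's keyed cipher:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define COOKIE_SIZE 8	/* mirrors TCP_FASTOPEN_COOKIE_SIZE */

struct toy_cookie {
	int len;
	uint8_t val[COOKIE_SIZE];
};

/* Stand-in for tcp_fastopen_cookie_gen(): mix a per-host key and the
 * peer's address into a cookie. FNV-1a here, AES in the kernel. */
static void toy_cookie_gen(uint32_t saddr, const uint8_t key[16],
			   struct toy_cookie *c)
{
	uint64_t h = 0xcbf29ce484222325ULL;
	int i;

	for (i = 0; i < 16; i++)
		h = (h ^ key[i]) * 0x100000001b3ULL;
	for (i = 0; i < 4; i++)
		h = (h ^ ((saddr >> (8 * i)) & 0xff)) * 0x100000001b3ULL;
	memcpy(c->val, &h, COOKIE_SIZE);
	c->len = COOKIE_SIZE;
}

/* The shape of the check: a wrong length is treated like a bad value,
 * and a bad value means "issue a fresh cookie" rather than "reset". */
static bool toy_cookie_valid(uint32_t saddr, const uint8_t key[16],
			     const struct toy_cookie *foc)
{
	struct toy_cookie expected;

	if (foc->len != COOKIE_SIZE)
		return false;
	toy_cookie_gen(saddr, key, &expected);
	return memcmp(foc->val, expected.val, COOKIE_SIZE) == 0;
}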
@@ -1285,6 +1476,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1285 __be32 daddr = ip_hdr(skb)->daddr; 1476 __be32 daddr = ip_hdr(skb)->daddr;
1286 __u32 isn = TCP_SKB_CB(skb)->when; 1477 __u32 isn = TCP_SKB_CB(skb)->when;
1287 bool want_cookie = false; 1478 bool want_cookie = false;
1479 struct flowi4 fl4;
1480 struct tcp_fastopen_cookie foc = { .len = -1 };
1481 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1482 struct sk_buff *skb_synack;
1483 int do_fastopen;
1288 1484
 1289 /* Never answer to SYNs sent to broadcast or multicast */ 1485 /* Never answer to SYNs sent to broadcast or multicast */
1290 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1486 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1319,7 +1515,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1319 tcp_clear_options(&tmp_opt); 1515 tcp_clear_options(&tmp_opt);
1320 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1516 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1321 tmp_opt.user_mss = tp->rx_opt.user_mss; 1517 tmp_opt.user_mss = tp->rx_opt.user_mss;
1322 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 1518 tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1519 want_cookie ? NULL : &foc);
1323 1520
1324 if (tmp_opt.cookie_plus > 0 && 1521 if (tmp_opt.cookie_plus > 0 &&
1325 tmp_opt.saw_tstamp && 1522 tmp_opt.saw_tstamp &&
@@ -1377,8 +1574,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1377 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1574 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1378 req->cookie_ts = tmp_opt.tstamp_ok; 1575 req->cookie_ts = tmp_opt.tstamp_ok;
1379 } else if (!isn) { 1576 } else if (!isn) {
1380 struct flowi4 fl4;
1381
1382 /* VJ's idea. We save last timestamp seen 1577 /* VJ's idea. We save last timestamp seen
1383 * from the destination in peer table, when entering 1578 * from the destination in peer table, when entering
1384 * state TIME-WAIT, and check against it before 1579 * state TIME-WAIT, and check against it before
@@ -1419,14 +1614,52 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1419 tcp_rsk(req)->snt_isn = isn; 1614 tcp_rsk(req)->snt_isn = isn;
1420 tcp_rsk(req)->snt_synack = tcp_time_stamp; 1615 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1421 1616
1422 if (tcp_v4_send_synack(sk, dst, req, 1617 if (dst == NULL) {
1423 (struct request_values *)&tmp_ext, 1618 dst = inet_csk_route_req(sk, &fl4, req);
1424 skb_get_queue_mapping(skb), 1619 if (dst == NULL)
1425 want_cookie) || 1620 goto drop_and_free;
1426 want_cookie) 1621 }
1622 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1623
1624 /* We don't call tcp_v4_send_synack() directly because we need
1625 * to make sure a child socket can be created successfully before
1626 * sending back synack!
1627 *
1628 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1629 * (or better yet, call tcp_send_synack() in the child context
1630 * directly, but will have to fix bunch of other code first)
1631 * after syn_recv_sock() except one will need to first fix the
1632 * latter to remove its dependency on the current implementation
1633 * of tcp_v4_send_synack()->tcp_select_initial_window().
1634 */
1635 skb_synack = tcp_make_synack(sk, dst, req,
1636 (struct request_values *)&tmp_ext,
1637 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1638
1639 if (skb_synack) {
1640 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1641 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1642 } else
1643 goto drop_and_free;
1644
1645 if (likely(!do_fastopen)) {
1646 int err;
1647 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1648 ireq->rmt_addr, ireq->opt);
1649 err = net_xmit_eval(err);
1650 if (err || want_cookie)
1651 goto drop_and_free;
1652
1653 tcp_rsk(req)->listener = NULL;
1654 /* Add the request_sock to the SYN table */
1655 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1656 if (fastopen_cookie_present(&foc) && foc.len != 0)
1657 NET_INC_STATS_BH(sock_net(sk),
1658 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1659 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1660 (struct request_values *)&tmp_ext))
1427 goto drop_and_free; 1661 goto drop_and_free;
1428 1662
1429 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1430 return 0; 1663 return 0;
1431 1664
1432drop_and_release: 1665drop_and_release:
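For context on how an application reaches the Fast Open path of tcp_v4_conn_request(): the companion "support TFO listeners" patch in this series adds a TCP_FASTOPEN socket option (value 23) whose argument becomes fastopenq->max_qlen, the cap consulted by tcp_fastopen_check(). A minimal server sketch under that assumption, with an arbitrary port and error handling omitted:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN 23		/* added by the TFO listener patch */
#endif

int main(void)
{
	int qlen = 5;		/* becomes fastopenq->max_qlen */
	int lfd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;
	char buf[1024];

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);
	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));

	/* Enable Fast Open on the listener and cap pending TFO reqs. */
	setsockopt(lfd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
	listen(lfd, 16);

	for (;;) {
		int cfd = accept(lfd, NULL, NULL);
		ssize_t n = read(cfd, buf, sizeof(buf));

		if (n > 0)	/* may be data from the SYN itself */
			write(cfd, buf, n);
		close(cfd);
	}
}

Data carried in the SYN is readable immediately after accept() returns, which is what tcp_v4_conn_req_fastopen() arranges by queueing the SYN payload on the child before adding it to the accept queue.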
@@ -1554,7 +1787,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1554 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, 1787 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1555 iph->saddr, iph->daddr); 1788 iph->saddr, iph->daddr);
1556 if (req) 1789 if (req)
1557 return tcp_check_req(sk, skb, req, prev); 1790 return tcp_check_req(sk, skb, req, prev, false);
1558 1791
1559 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, 1792 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1560 th->source, iph->daddr, th->dest, inet_iif(skb)); 1793 th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1977,6 +2210,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
1977 tcp_cookie_values_release); 2210 tcp_cookie_values_release);
1978 tp->cookie_values = NULL; 2211 tp->cookie_values = NULL;
1979 } 2212 }
2213 BUG_ON(tp->fastopen_rsk != NULL);
1980 2214
1981 /* If socket is aborted during connect operation */ 2215 /* If socket is aborted during connect operation */
1982 tcp_free_fastopen_req(tp); 2216 tcp_free_fastopen_req(tp);
@@ -2425,6 +2659,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2425 const struct tcp_sock *tp = tcp_sk(sk); 2659 const struct tcp_sock *tp = tcp_sk(sk);
2426 const struct inet_connection_sock *icsk = inet_csk(sk); 2660 const struct inet_connection_sock *icsk = inet_csk(sk);
2427 const struct inet_sock *inet = inet_sk(sk); 2661 const struct inet_sock *inet = inet_sk(sk);
2662 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2428 __be32 dest = inet->inet_daddr; 2663 __be32 dest = inet->inet_daddr;
2429 __be32 src = inet->inet_rcv_saddr; 2664 __be32 src = inet->inet_rcv_saddr;
2430 __u16 destp = ntohs(inet->inet_dport); 2665 __u16 destp = ntohs(inet->inet_dport);
@@ -2469,7 +2704,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2469 jiffies_to_clock_t(icsk->icsk_ack.ato), 2704 jiffies_to_clock_t(icsk->icsk_ack.ato),
2470 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2705 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2471 tp->snd_cwnd, 2706 tp->snd_cwnd,
2472 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, 2707 sk->sk_state == TCP_LISTEN ?
2708 (fastopenq ? fastopenq->max_qlen : 0) :
2709 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2473 len); 2710 len);
2474} 2711}
2475 2712
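One user-visible side effect of the get_tcp4_sock() change above: for sockets in TCP_LISTEN, the last column of /proc/net/tcp, which is meaningless as a slow-start threshold for a listener, now reports fastopenq->max_qlen. A small sketch that reads it back; the field index assumes this kernel's row format:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);		/* skip header row */
	while (fgets(line, sizeof(line), f)) {
		char *field[32], *tok;
		int n = 0;

		for (tok = strtok(line, " \t\n"); tok && n < 32;
		     tok = strtok(NULL, " \t\n"))
			field[n++] = tok;
		/* field[3] is the state (0A == TCP_LISTEN); field[16] is
		 * the column that now carries max_qlen for listeners. */
		if (n > 16 && strcmp(field[3], "0A") == 0)
			printf("%s max_qlen=%s\n", field[1], field[16]);
	}
	fclose(f);
	return 0;
}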
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6ff7f10dce9d..e965319d610b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
508 newtp->rx_opt.mss_clamp = req->mss; 508 newtp->rx_opt.mss_clamp = req->mss;
509 TCP_ECN_openreq_child(newtp, req); 509 TCP_ECN_openreq_child(newtp, req);
510 newtp->fastopen_rsk = NULL;
510 511
511 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 512 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
512 } 513 }
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
515EXPORT_SYMBOL(tcp_create_openreq_child); 516EXPORT_SYMBOL(tcp_create_openreq_child);
516 517
517/* 518/*
518 * Process an incoming packet for SYN_RECV sockets represented 519 * Process an incoming packet for SYN_RECV sockets represented as a
519 * as a request_sock. 520 * request_sock. Normally sk is the listener socket but for TFO it
521 * points to the child socket.
522 *
523 * XXX (TFO) - The current impl contains a special check for ack
 524 * validation here and one inside tcp_v4_reqsk_send_ack(). Can we do better?
520 */ 525 */
521 526
522struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 527struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
523 struct request_sock *req, 528 struct request_sock *req,
524 struct request_sock **prev) 529 struct request_sock **prev,
530 bool fastopen)
525{ 531{
526 struct tcp_options_received tmp_opt; 532 struct tcp_options_received tmp_opt;
527 const u8 *hash_location; 533 const u8 *hash_location;
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
530 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 536 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
531 bool paws_reject = false; 537 bool paws_reject = false;
532 538
539 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
540
533 tmp_opt.saw_tstamp = 0; 541 tmp_opt.saw_tstamp = 0;
534 if (th->doff > (sizeof(struct tcphdr)>>2)) { 542 if (th->doff > (sizeof(struct tcphdr)>>2)) {
535 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 543 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
565 * 573 *
566 * Enforce "SYN-ACK" according to figure 8, figure 6 574 * Enforce "SYN-ACK" according to figure 8, figure 6
567 * of RFC793, fixed by RFC1122. 575 * of RFC793, fixed by RFC1122.
576 *
577 * Note that even if there is new data in the SYN packet
 578 * it will be thrown away too.
568 */ 579 */
569 req->rsk_ops->rtx_syn_ack(sk, req, NULL); 580 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
570 return NULL; 581 return NULL;
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
622 * sent (the segment carries an unacceptable ACK) ... 633 * sent (the segment carries an unacceptable ACK) ...
623 * a reset is sent." 634 * a reset is sent."
624 * 635 *
625 * Invalid ACK: reset will be sent by listening socket 636 * Invalid ACK: reset will be sent by listening socket.
637 * Note that the ACK validity check for a Fast Open socket is done
 638 * elsewhere, directly against the child socket rather than req,
 639 * because user data may already have been sent out.
626 */ 640 */
627 if ((flg & TCP_FLAG_ACK) && 641 if ((flg & TCP_FLAG_ACK) && !fastopen &&
628 (TCP_SKB_CB(skb)->ack_seq != 642 (TCP_SKB_CB(skb)->ack_seq !=
629 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 643 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
630 return sk; 644 return sk;
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
637 /* RFC793: "first check sequence number". */ 651 /* RFC793: "first check sequence number". */
638 652
639 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 653 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
640 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { 654 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
641 /* Out of window: send ACK and drop. */ 655 /* Out of window: send ACK and drop. */
642 if (!(flg & TCP_FLAG_RST)) 656 if (!(flg & TCP_FLAG_RST))
643 req->rsk_ops->send_ack(sk, skb, req); 657 req->rsk_ops->send_ack(sk, skb, req);
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
648 662
649 /* In sequence, PAWS is OK. */ 663 /* In sequence, PAWS is OK. */
650 664
651 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) 665 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
652 req->ts_recent = tmp_opt.rcv_tsval; 666 req->ts_recent = tmp_opt.rcv_tsval;
653 667
654 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 668 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
667 681
668 /* ACK sequence verified above, just make sure ACK is 682 /* ACK sequence verified above, just make sure ACK is
669 * set. If ACK not set, just silently drop the packet. 683 * set. If ACK not set, just silently drop the packet.
684 *
685 * XXX (TFO) - if we ever allow "data after SYN", the
686 * following check needs to be removed.
670 */ 687 */
671 if (!(flg & TCP_FLAG_ACK)) 688 if (!(flg & TCP_FLAG_ACK))
672 return NULL; 689 return NULL;
673 690
691 /* For Fast Open no more processing is needed (sk is the
692 * child socket).
693 */
694 if (fastopen)
695 return sk;
696
674 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 697 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
675 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 698 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
676 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 699 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -706,11 +729,21 @@ listen_overflow:
706 } 729 }
707 730
708embryonic_reset: 731embryonic_reset:
709 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 732 if (!(flg & TCP_FLAG_RST)) {
 710 if (!(flg & TCP_FLAG_RST)) 733 /* Received a bad SYN pkt - for TFO we try not to reset
 734 * the local connection unless it's really necessary, to
 735 * avoid becoming vulnerable to an outside attack aiming at
 736 * resetting legitimate local connections.
737 */
711 req->rsk_ops->send_reset(sk, skb); 738 req->rsk_ops->send_reset(sk, skb);
712 739 } else if (fastopen) { /* received a valid RST pkt */
713 inet_csk_reqsk_queue_drop(sk, req, prev); 740 reqsk_fastopen_remove(sk, req, true);
741 tcp_reset(sk);
742 }
743 if (!fastopen) {
744 inet_csk_reqsk_queue_drop(sk, req, prev);
745 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
746 }
714 return NULL; 747 return NULL;
715} 748}
716EXPORT_SYMBOL(tcp_check_req); 749EXPORT_SYMBOL(tcp_check_req);
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req);
719 * Queue segment on the new socket if the new socket is active, 752 * Queue segment on the new socket if the new socket is active,
720 * otherwise we just shortcircuit this and continue with 753 * otherwise we just shortcircuit this and continue with
721 * the new socket. 754 * the new socket.
755 *
756 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
757 * when entering. But other states are possible due to a race condition
 758 * where, after __inet_lookup_established() fails but before the
 759 * listener lock is obtained, other packets cause the same connection to
760 * be created.
722 */ 761 */
723 762
724int tcp_child_process(struct sock *parent, struct sock *child, 763int tcp_child_process(struct sock *parent, struct sock *child,
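The rcv_isn + 1 to rcv_nxt switches above matter because on a TFO child the payload carried in the SYN has already been consumed, so the left edge of the receive window is past rcv_isn + 1. A runnable sketch of the arithmetic, using a simplified, non-wrapping stand-in for tcp_in_window() and made-up sequence numbers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified tcp_in_window(): does [seq, end_seq] overlap the receive
 * window [s_win, e_win]? Sequence wraparound is ignored for brevity. */
static bool in_window(uint32_t seq, uint32_t end_seq,
		      uint32_t s_win, uint32_t e_win)
{
	return seq == s_win || (end_seq > s_win && seq < e_win);
}

int main(void)
{
	uint32_t rcv_isn = 1000;	/* client's ISN */
	uint32_t rcv_nxt = 1501;	/* ISN + SYN + 500 bytes of SYN data */
	uint32_t rcv_wnd = 14600;
	/* A stale retransmit of the already-queued SYN payload: */
	uint32_t seq = 1001, end_seq = 1501;

	printf("anchor rcv_isn+1: %d\n",
	       in_window(seq, end_seq, rcv_isn + 1, rcv_isn + 1 + rcv_wnd));
	printf("anchor rcv_nxt:   %d\n",
	       in_window(seq, end_seq, rcv_nxt, rcv_nxt + rcv_wnd));
	/* Prints 1 then 0: with the old anchor the duplicate looks like
	 * new in-window data; anchored at rcv_nxt it is correctly seen
	 * as out of window, so the code just re-ACKs and drops it. */
	return 0;
}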
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d04632673a9e..9383b51f3efc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk,
702 unsigned int mss, struct sk_buff *skb, 702 unsigned int mss, struct sk_buff *skb,
703 struct tcp_out_options *opts, 703 struct tcp_out_options *opts,
704 struct tcp_md5sig_key **md5, 704 struct tcp_md5sig_key **md5,
705 struct tcp_extend_values *xvp) 705 struct tcp_extend_values *xvp,
706 struct tcp_fastopen_cookie *foc)
706{ 707{
707 struct inet_request_sock *ireq = inet_rsk(req); 708 struct inet_request_sock *ireq = inet_rsk(req);
708 unsigned int remaining = MAX_TCP_OPTION_SPACE; 709 unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk,
747 if (unlikely(!ireq->tstamp_ok)) 748 if (unlikely(!ireq->tstamp_ok))
748 remaining -= TCPOLEN_SACKPERM_ALIGNED; 749 remaining -= TCPOLEN_SACKPERM_ALIGNED;
749 } 750 }
750 751 if (foc != NULL) {
752 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
753 need = (need + 3) & ~3U; /* Align to 32 bits */
754 if (remaining >= need) {
755 opts->options |= OPTION_FAST_OPEN_COOKIE;
756 opts->fastopen_cookie = foc;
757 remaining -= need;
758 }
759 }
751 /* Similar rationale to tcp_syn_options() applies here, too. 760 /* Similar rationale to tcp_syn_options() applies here, too.
752 * If the <SYN> options fit, the same options should fit now! 761 * If the <SYN> options fit, the same options should fit now!
753 */ 762 */
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk)
2658 */ 2667 */
2659struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2668struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2660 struct request_sock *req, 2669 struct request_sock *req,
2661 struct request_values *rvp) 2670 struct request_values *rvp,
2671 struct tcp_fastopen_cookie *foc)
2662{ 2672{
2663 struct tcp_out_options opts; 2673 struct tcp_out_options opts;
2664 struct tcp_extend_values *xvp = tcp_xv(rvp); 2674 struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2718#endif 2728#endif
2719 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2729 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2720 tcp_header_size = tcp_synack_options(sk, req, mss, 2730 tcp_header_size = tcp_synack_options(sk, req, mss,
2721 skb, &opts, &md5, xvp) 2731 skb, &opts, &md5, xvp, foc)
2722 + sizeof(*th); 2732 + sizeof(*th);
2723 2733
2724 skb_push(skb, tcp_header_size); 2734 skb_push(skb, tcp_header_size);
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2772 } 2782 }
2773 2783
2774 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2784 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2775 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); 2785 /* XXX data is queued and acked as is. No buffer/window check */
2786 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2776 2787
2777 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2788 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2778 th->window = htons(min(req->rcv_wnd, 65535U)); 2789 th->window = htons(min(req->rcv_wnd, 65535U));
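The rounding in tcp_synack_options() above is easy to check by hand: the experimental Fast Open option costs TCPOLEN_EXP_FASTOPEN_BASE bytes (4: kind, length, and a two-byte magic, per the header patch in this series) plus the cookie, rounded up to a 32-bit boundary. A quick sketch of the resulting option-space table:

#include <stdio.h>

#define TCPOLEN_EXP_FASTOPEN_BASE 4	/* kind + len + 2-byte magic */

int main(void)
{
	int len;

	/* Cookie lengths run from TCP_FASTOPEN_COOKIE_MIN (4) to
	 * TCP_FASTOPEN_COOKIE_MAX (16); the default is 8. */
	for (len = 4; len <= 16; len += 2) {
		unsigned int need = TCPOLEN_EXP_FASTOPEN_BASE + len;

		need = (need + 3) & ~3U;	/* align to 32 bits */
		printf("cookie %2d bytes -> %2u option bytes\n", len, need);
	}
	return 0;
}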
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b774a03bd1dc..fc04711e80c8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -305,6 +305,35 @@ static void tcp_probe_timer(struct sock *sk)
305} 305}
306 306
307/* 307/*
 308 * Timer for a Fast Open socket to retransmit SYNACK. Note that the
309 * sk here is the child socket, not the parent (listener) socket.
310 */
311static void tcp_fastopen_synack_timer(struct sock *sk)
312{
313 struct inet_connection_sock *icsk = inet_csk(sk);
314 int max_retries = icsk->icsk_syn_retries ? :
315 sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
316 struct request_sock *req;
317
318 req = tcp_sk(sk)->fastopen_rsk;
319 req->rsk_ops->syn_ack_timeout(sk, req);
320
321 if (req->retrans >= max_retries) {
322 tcp_write_err(sk);
323 return;
324 }
 325 /* XXX (TFO) - Unlike a regular SYN-ACK retransmit, we ignore the
 326 * error returned from rtx_syn_ack() to make it more persistent, like
 327 * a regular retransmit, because if the child socket has been accepted
 328 * it's not good to give up too easily.
329 */
330 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
331 req->retrans++;
332 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
333 TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
334}
335
336/*
308 * The TCP retransmit timer. 337 * The TCP retransmit timer.
309 */ 338 */
310 339
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk)
317 tcp_resume_early_retransmit(sk); 346 tcp_resume_early_retransmit(sk);
318 return; 347 return;
319 } 348 }
320 349 if (tp->fastopen_rsk) {
350 BUG_ON(sk->sk_state != TCP_SYN_RECV &&
351 sk->sk_state != TCP_FIN_WAIT1);
352 tcp_fastopen_synack_timer(sk);
 353 /* Before we receive the ACK to our SYN-ACK, don't retransmit
354 * anything else (e.g., data or FIN segments).
355 */
356 return;
357 }
321 if (!tp->packets_out) 358 if (!tp->packets_out)
322 goto out; 359 goto out;
323 360
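To make the timer logic above concrete: the child is initially armed with TCP_TIMEOUT_INIT in tcp_v4_conn_req_fastopen(), and each expiry retransmits the SYN-ACK and re-arms at TCP_TIMEOUT_INIT << retrans until retrans reaches max_retries. A sketch of the schedule, assuming TCP_TIMEOUT_INIT of one second and the default sysctl_tcp_synack_retries of 5:

#include <stdio.h>

#define TCP_TIMEOUT_INIT 1	/* seconds, per RFC 6298 */

int main(void)
{
	int sysctl_tcp_synack_retries = 5;	/* kernel default */
	int max_retries = sysctl_tcp_synack_retries + 1;
	int retrans = 0;
	unsigned int timeout = TCP_TIMEOUT_INIT, elapsed = 0;

	while (retrans < max_retries) {
		elapsed += timeout;
		retrans++;
		printf("SYN-ACK retransmit %d at t=%us\n", retrans, elapsed);
		timeout = TCP_TIMEOUT_INIT << retrans;
	}
	printf("tcp_write_err() gives up at t=%us\n", elapsed + timeout);
	return 0;
}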
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index bb46061c813a..182ab9a85d6c 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -190,6 +190,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
190 ireq = inet_rsk(req); 190 ireq = inet_rsk(req);
191 ireq6 = inet6_rsk(req); 191 ireq6 = inet6_rsk(req);
192 treq = tcp_rsk(req); 192 treq = tcp_rsk(req);
193 treq->listener = NULL;
193 194
194 if (security_inet_conn_request(sk, skb, req)) 195 if (security_inet_conn_request(sk, skb, req))
195 goto out_free; 196 goto out_free;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f99b81d53cca..09078b9bc6f6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -475,7 +475,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
475 if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) 475 if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL)
476 goto done; 476 goto done;
477 477
478 skb = tcp_make_synack(sk, dst, req, rvp); 478 skb = tcp_make_synack(sk, dst, req, rvp, NULL);
479 479
480 if (skb) { 480 if (skb) {
481 __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); 481 __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
@@ -987,7 +987,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
987 &ipv6_hdr(skb)->saddr, 987 &ipv6_hdr(skb)->saddr,
988 &ipv6_hdr(skb)->daddr, inet6_iif(skb)); 988 &ipv6_hdr(skb)->daddr, inet6_iif(skb));
989 if (req) 989 if (req)
990 return tcp_check_req(sk, skb, req, prev); 990 return tcp_check_req(sk, skb, req, prev, false);
991 991
992 nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, 992 nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
993 &ipv6_hdr(skb)->saddr, th->source, 993 &ipv6_hdr(skb)->saddr, th->source,
@@ -1179,6 +1179,7 @@ have_isn:
1179 want_cookie) 1179 want_cookie)
1180 goto drop_and_free; 1180 goto drop_and_free;
1181 1181
1182 tcp_rsk(req)->listener = NULL;
1182 inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); 1183 inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1183 return 0; 1184 return 0;
1184 1185