author		Pablo Neira Ayuso <pablo@netfilter.org>	2012-09-03 09:28:30 -0400
committer	Pablo Neira Ayuso <pablo@netfilter.org>	2012-09-03 09:34:51 -0400
commit		ace1fe1231bdfffd60b5e703aa5b7283fbf98dbd (patch)
tree		06c7492a8f3cc65f916768616ca24c6bc7171761 /net/ipv4
parent		ce9f3f31efb88841e4df98794b13dbac8c4901da (diff)
parent		a2dc375e12334b3d8f787a48b2fb6172ccfb80ae (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
This merges (3f509c6 "netfilter: nf_nat_sip: fix incorrect handling of EBUSY for RTCP expectation") into Patrick McHardy's IPv6 NAT changes.
Diffstat (limited to 'net/ipv4')
-rw-r--r--	net/ipv4/af_inet.c		|  28
-rw-r--r--	net/ipv4/devinet.c		|   6
-rw-r--r--	net/ipv4/fib_frontend.c		|   7
-rw-r--r--	net/ipv4/inet_connection_sock.c	|  57
-rw-r--r--	net/ipv4/inet_diag.c		|  21
-rw-r--r--	net/ipv4/ipmr.c			|  14
-rw-r--r--	net/ipv4/ping.c			|  22
-rw-r--r--	net/ipv4/proc.c			|   4
-rw-r--r--	net/ipv4/raw.c			|   4
-rw-r--r--	net/ipv4/route.c		|  11
-rw-r--r--	net/ipv4/syncookies.c		|   1
-rw-r--r--	net/ipv4/sysctl_net_ipv4.c	|  87
-rw-r--r--	net/ipv4/tcp.c			|  49
-rw-r--r--	net/ipv4/tcp_fastopen.c		|  83
-rw-r--r--	net/ipv4/tcp_input.c		|  90
-rw-r--r--	net/ipv4/tcp_ipv4.c		| 275
-rw-r--r--	net/ipv4/tcp_minisocks.c	|  61
-rw-r--r--	net/ipv4/tcp_output.c		|  21
-rw-r--r--	net/ipv4/tcp_timer.c		|  39
-rw-r--r--	net/ipv4/udp.c			|   4
-rw-r--r--	net/ipv4/udp_diag.c		|   5
21 files changed, 760 insertions(+), 129 deletions(-)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6681ccf5c3ee..4f70ef0b946d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk)
 		pr_err("Attempt to release alive inet socket %p\n", sk);
 		return;
 	}
+	if (sk->sk_type == SOCK_STREAM) {
+		struct fastopen_queue *fastopenq =
+			inet_csk(sk)->icsk_accept_queue.fastopenq;
+		kfree(fastopenq);
+	}
 
 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
 	WARN_ON(atomic_read(&sk->sk_wmem_alloc));
@@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
+		/* Check special setups for testing purpose to enable TFO w/o
+		 * requiring TCP_FASTOPEN sockopt.
+		 * Note that only TCP sockets (SOCK_STREAM) will reach here.
+		 * Also fastopenq may already been allocated because this
+		 * socket was in TCP_LISTEN state previously but was
+		 * shutdown() (rather than close()).
+		 */
+		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+		    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+				err = fastopen_init_queue(sk, backlog);
+			else if ((sysctl_tcp_fastopen &
+				  TFO_SERVER_WO_SOCKOPT2) != 0)
+				err = fastopen_init_queue(sk,
+				    ((uint)sysctl_tcp_fastopen) >> 16);
+			else
+				err = 0;
+			if (err)
+				goto out;
+		}
 		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
@@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	sock_rps_record_flow(sk2);
 	WARN_ON(!((1 << sk2->sk_state) &
-		  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+		  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+		  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
 	sock_graft(sk2, newsock);
 
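The inet_listen() hunk above gives a server two ways into TFO: the sysctl test knobs (TFO_SERVER_WO_SOCKOPT1/2) or the TCP_FASTOPEN socket option handled in the tcp.c hunk further down; both end up allocating the fastopen queue via fastopen_init_queue(). A minimal userspace sketch of the sockopt path, assuming a kernel with this series; error handling is omitted and the fallback #define mirrors the value used by the series:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <sys/socket.h>

#ifndef TCP_FASTOPEN
#define TCP_FASTOPEN	23	/* sockopt value added by this series */
#endif

int tfo_server_socket(unsigned short port)
{
	int qlen = 16;	/* cap on pending SYN+data (fastopenq->max_qlen) */
	struct sockaddr_in a;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&a, 0, sizeof(a));
	a.sin_family = AF_INET;
	a.sin_port = htons(port);
	bind(fd, (struct sockaddr *)&a, sizeof(a));
	/* Allowed while the socket is closed or listening (see the
	 * TCP_FASTOPEN case in do_tcp_setsockopt() below); allocates
	 * the fastopen queue just like the sysctl-only paths above. */
	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
	listen(fd, 128);
	return fd;
}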
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 6a5e6e4b142c..adf273f8ad2e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1147,12 +1147,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 			 void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct in_device *in_dev;
-
-	if (event == NETDEV_UNREGISTER_FINAL)
-		goto out;
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 
-	in_dev = __in_dev_get_rtnl(dev);
 	ASSERT_RTNL();
 
 	if (!in_dev) {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index fd7d9ae64f16..acdee325d972 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1050,9 +1050,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 		return NOTIFY_DONE;
 	}
 
-	if (event == NETDEV_UNREGISTER_FINAL)
-		return NOTIFY_DONE;
-
 	in_dev = __in_dev_get_rtnl(dev);
 
 	switch (event) {
@@ -1064,14 +1061,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 		fib_sync_up(dev);
 #endif
 		atomic_inc(&net->ipv4.dev_addr_genid);
-		rt_cache_flush(dev_net(dev), -1);
+		rt_cache_flush(net, -1);
 		break;
 	case NETDEV_DOWN:
 		fib_disable_ip(dev, 0, 0);
 		break;
 	case NETDEV_CHANGEMTU:
 	case NETDEV_CHANGE:
-		rt_cache_flush(dev_net(dev), 0);
+		rt_cache_flush(net, 0);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7f75f21d7b83..8464b79c493f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct sock *newsk;
+	struct request_sock *req;
 	int error;
 
 	lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		goto out_err;
 
 	/* Find already established connection */
-	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+	if (reqsk_queue_empty(queue)) {
 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
 		/* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 		if (error)
 			goto out_err;
 	}
-
-	newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-	WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+	req = reqsk_queue_remove(queue);
+	newsk = req->sk;
+
+	sk_acceptq_removed(sk);
+	if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) {
+		spin_lock_bh(&queue->fastopenq->lock);
+		if (tcp_rsk(req)->listener) {
+			/* We are still waiting for the final ACK from 3WHS
+			 * so can't free req now. Instead, we set req->sk to
+			 * NULL to signify that the child socket is taken
+			 * so reqsk_fastopen_remove() will free the req
+			 * when 3WHS finishes (or is aborted).
+			 */
+			req->sk = NULL;
+			req = NULL;
+		}
+		spin_unlock_bh(&queue->fastopenq->lock);
+	}
 out:
 	release_sock(sk);
+	if (req)
+		__reqsk_free(req);
 	return newsk;
 out_err:
 	newsk = NULL;
+	req = NULL;
 	*err = error;
 	goto out;
 }
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 void inet_csk_listen_stop(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
 	struct request_sock *acc_req;
 	struct request_sock *req;
 
 	inet_csk_delete_keepalive_timer(sk);
 
 	/* make all the listen_opt local to us */
-	acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+	acc_req = reqsk_queue_yank_acceptq(queue);
 
 	/* Following specs, it would be better either to send FIN
 	 * (and enter FIN-WAIT-1, it is normal close)
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
 	 * To be honest, we are not able to make either
 	 * of the variants now.			--ANK
 	 */
-	reqsk_queue_destroy(&icsk->icsk_accept_queue);
+	reqsk_queue_destroy(queue);
 
 	while ((req = acc_req) != NULL) {
 		struct sock *child = req->sk;
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		percpu_counter_inc(sk->sk_prot->orphan_count);
 
+		if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) {
+			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+			BUG_ON(sk != tcp_rsk(req)->listener);
+
+			/* Paranoid, to prevent race condition if
+			 * an inbound pkt destined for child is
+			 * blocked by sock lock in tcp_v4_rcv().
+			 * Also to satisfy an assertion in
+			 * tcp_v4_destroy_sock().
+			 */
+			tcp_sk(child)->fastopen_rsk = NULL;
+			sock_put(sk);
+		}
 		inet_csk_destroy_sock(child);
 
 		bh_unlock_sock(child);
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
 		sk_acceptq_removed(sk);
 		__reqsk_free(req);
 	}
+	if (queue->fastopenq != NULL) {
+		/* Free all the reqs queued in rskq_rst_head. */
+		spin_lock_bh(&queue->fastopenq->lock);
+		acc_req = queue->fastopenq->rskq_rst_head;
+		queue->fastopenq->rskq_rst_head = NULL;
+		spin_unlock_bh(&queue->fastopenq->lock);
+		while ((req = acc_req) != NULL) {
+			acc_req = req->dl_next;
+			__reqsk_free(req);
+		}
+	}
 	WARN_ON(sk->sk_ack_backlog);
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
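The inet_csk_accept() hunk implements a small ownership hand-off: while the 3WHS is still pending the acceptor must not free the request sock, so it clears req->sk under the fastopen queue lock and lets reqsk_fastopen_remove() free it once the handshake finishes or aborts. A distilled userspace sketch of the same pattern, with illustrative names and a pthread mutex standing in for the kernel spinlock:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct fo_req {
	void *sk;		/* child socket; NULL once taken */
	bool three_whs_pending;	/* final ACK not yet received */
	pthread_mutex_t lock;	/* stands in for fastopenq->lock */
};

static void *fo_accept(struct fo_req *req)
{
	void *child = req->sk;
	pthread_mutex_t *lock = &req->lock;

	pthread_mutex_lock(lock);
	if (req->three_whs_pending) {
		req->sk = NULL;	/* tells the ACK path the child is taken */
		req = NULL;	/* ...and that it now owns the req */
	}
	pthread_mutex_unlock(lock);

	free(req);		/* no-op when the ACK path owns it */
	return child;
}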
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 570e61f9611f..8bc005b1435f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -69,6 +69,7 @@ static inline void inet_diag_unlock_handler(
 
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      struct sk_buff *skb, struct inet_diag_req_v2 *req,
+		      struct user_namespace *user_ns,
 		      u32 pid, u32 seq, u16 nlmsg_flags,
 		      const struct nlmsghdr *unlh)
 {
@@ -124,7 +125,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 #endif
 
-	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
 	r->idiag_inode = sock_i_ino(sk);
 
 	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -199,11 +200,12 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
 
 static int inet_csk_diag_fill(struct sock *sk,
 			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
+			      struct user_namespace *user_ns,
 			      u32 pid, u32 seq, u16 nlmsg_flags,
 			      const struct nlmsghdr *unlh)
 {
 	return inet_sk_diag_fill(sk, inet_csk(sk),
-			skb, req, pid, seq, nlmsg_flags, unlh);
+			skb, req, user_ns, pid, seq, nlmsg_flags, unlh);
 }
 
 static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
@@ -256,14 +258,16 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
-			struct inet_diag_req_v2 *r, u32 pid, u32 seq, u16 nlmsg_flags,
+			struct inet_diag_req_v2 *r,
+			struct user_namespace *user_ns,
+			u32 pid, u32 seq, u16 nlmsg_flags,
 			const struct nlmsghdr *unlh)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
 		return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
 					   skb, r, pid, seq, nlmsg_flags,
 					   unlh);
-	return inet_csk_diag_fill(sk, skb, r, pid, seq, nlmsg_flags, unlh);
+	return inet_csk_diag_fill(sk, skb, r, user_ns, pid, seq, nlmsg_flags, unlh);
 }
 
 int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -311,6 +315,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
 	}
 
 	err = sk_diag_fill(sk, rep, req,
+			   sk_user_ns(NETLINK_CB(in_skb).ssk),
 			   NETLINK_CB(in_skb).pid,
 			   nlh->nlmsg_seq, 0, nlh);
 	if (err < 0) {
@@ -551,6 +556,7 @@ static int inet_csk_diag_dump(struct sock *sk,
 		return 0;
 
 	return inet_csk_diag_fill(sk, skb, r,
+				  sk_user_ns(NETLINK_CB(cb->skb).ssk),
 				  NETLINK_CB(cb->skb).pid,
 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
@@ -591,7 +597,9 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 }
 
 static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
-			      struct request_sock *req, u32 pid, u32 seq,
+			      struct request_sock *req,
+			      struct user_namespace *user_ns,
+			      u32 pid, u32 seq,
 			      const struct nlmsghdr *unlh)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -625,7 +633,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
 	r->idiag_expires = jiffies_to_msecs(tmo);
 	r->idiag_rqueue = 0;
 	r->idiag_wqueue = 0;
-	r->idiag_uid = sock_i_uid(sk);
+	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
 	r->idiag_inode = 0;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (r->idiag_family == AF_INET6) {
@@ -702,6 +710,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 		}
 
 		err = inet_diag_fill_req(skb, sk, req,
+					 sk_user_ns(NETLINK_CB(cb->skb).ssk),
 					 NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, cb->nlh);
 		if (err < 0) {
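The inet_diag changes above thread the requesting socket's user namespace (sk_user_ns(NETLINK_CB(...).ssk)) down to the points that fill in idiag_uid, so UIDs are reported as the requester would see them. For reference, a sketch of the conversion involved (from_kuid_munged() is the real API from <linux/uidgid.h>; the wrapper name is illustrative):

#include <linux/uidgid.h>

/* from_kuid_munged() never fails: a kuid with no mapping in 'ns' comes
 * back as the overflow uid rather than an error, which makes it suitable
 * for read-only reporting paths like inet_diag. */
static uid_t uid_as_seen_by(struct user_namespace *ns, kuid_t kuid)
{
	return from_kuid_munged(ns, kuid);
}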
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3a57570c8ee5..8aa7a4cf9139 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -124,6 +124,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
 static struct kmem_cache *mrt_cachep __read_mostly;
 
 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
+static void ipmr_free_table(struct mr_table *mrt);
+
 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 			 struct sk_buff *skb, struct mfc_cache *cache,
 			 int local);
@@ -131,6 +133,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
 			   struct sk_buff *pkt, vifi_t vifi, int assert);
 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 			      struct mfc_cache *c, struct rtmsg *rtm);
+static void mroute_clean_tables(struct mr_table *mrt);
 static void ipmr_expire_process(unsigned long arg);
 
 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -271,7 +274,7 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 
 	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
 		list_del(&mrt->list);
-		kfree(mrt);
+		ipmr_free_table(mrt);
 	}
 	fib_rules_unregister(net->ipv4.mr_rules_ops);
 }
@@ -299,7 +302,7 @@ static int __net_init ipmr_rules_init(struct net *net)
 
 static void __net_exit ipmr_rules_exit(struct net *net)
 {
-	kfree(net->ipv4.mrt);
+	ipmr_free_table(net->ipv4.mrt);
 }
 #endif
 
@@ -336,6 +339,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 	return mrt;
 }
 
+static void ipmr_free_table(struct mr_table *mrt)
+{
+	del_timer_sync(&mrt->ipmr_expire_timer);
+	mroute_clean_tables(mrt);
+	kfree(mrt);
+}
+
 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
 
 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 6232d476f37e..8f3d05424a3e 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -185,10 +185,10 @@ exit:
 	return sk;
 }
 
-static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
-					  gid_t *high)
+static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
+					  kgid_t *high)
 {
-	gid_t *data = net->ipv4.sysctl_ping_group_range;
+	kgid_t *data = net->ipv4.sysctl_ping_group_range;
 	unsigned int seq;
 
 	do {
@@ -203,19 +203,13 @@ static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
 static int ping_init_sock(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
-	gid_t group = current_egid();
-	gid_t range[2];
+	kgid_t group = current_egid();
 	struct group_info *group_info = get_current_groups();
 	int i, j, count = group_info->ngroups;
 	kgid_t low, high;
 
-	inet_get_ping_group_range_net(net, range, range+1);
-	low = make_kgid(&init_user_ns, range[0]);
-	high = make_kgid(&init_user_ns, range[1]);
-	if (!gid_valid(low) || !gid_valid(high) || gid_lt(high, low))
-		return -EACCES;
-
-	if (range[0] <= group && group <= range[1])
+	inet_get_ping_group_range_net(net, &low, &high);
+	if (gid_lte(low, group) && gid_lte(group, high))
 		return 0;
 
 	for (i = 0; i < group_info->nblocks; i++) {
@@ -845,7 +839,9 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f,
 		bucket, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
-		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		0, 0L, 0,
+		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp,
 		atomic_read(&sp->sk_drops), len);
 }
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 957acd12250b..8de53e1ddd54 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -263,6 +263,10 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
 	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
 	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ff0f071969ea..f2425785d40a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -992,7 +992,9 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 		i, src, srcp, dest, destp, sp->sk_state,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
-		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		0, 0L, 0,
+		from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+		0, sock_i_ino(sp),
 		atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 50f6d3adb474..dc9549b5eb1c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -934,12 +934,14 @@ static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (mtu < ip_rt_min_pmtu)
 		mtu = ip_rt_min_pmtu;
 
+	rcu_read_lock();
 	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
 
 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 				      jiffies + ip_rt_mtu_expires);
 	}
+	rcu_read_unlock();
 	return mtu;
 }
 
@@ -956,7 +958,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 		dst->obsolete = DST_OBSOLETE_KILL;
 	} else {
 		rt->rt_pmtu = mtu;
-		dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+		rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
 	}
 }
 
@@ -1132,10 +1134,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	const struct rtable *rt = (const struct rtable *) dst;
 	unsigned int mtu = rt->rt_pmtu;
 
-	if (mtu && time_after_eq(jiffies, rt->dst.expires))
-		mtu = 0;
-
-	if (!mtu)
+	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
 		mtu = dst_metric_raw(dst, RTAX_MTU);
 
 	if (mtu && rt_is_output_route(rt))
@@ -1263,7 +1262,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
-	if (dst->flags & DST_NOCACHE) {
+	if (!list_empty(&rt->rt_uncached)) {
 		spin_lock_bh(&rt_uncached_lock);
 		list_del(&rt->rt_uncached);
 		spin_unlock_bh(&rt_uncached_lock);
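Two details in the route.c hunks are easy to miss: ipv4_mtu() now treats a zero rt_pmtu and an expired one identically, and __ip_rt_update_pmtu() forces a non-zero expiry with max(1UL, ...) because dst.expires == 0 means "no expiry". Both rely on the usual wrap-safe jiffies comparison; for reference, this is how <linux/jiffies.h> defines it:

/* True when a is at or after b, correct across jiffies wraparound
 * because the subtraction is evaluated in signed arithmetic. */
#define time_after_eq(a, b)			\
	(typecheck(unsigned long, a) &&		\
	 typecheck(unsigned long, b) &&		\
	 ((long)((a) - (b)) >= 0))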
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650e1528e1e6..ba48e799b031 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->tstamp_ok = tcp_opt.saw_tstamp;
 	req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
 	treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+	treq->listener = NULL;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1b5ce96707a3..9205e492dc9d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -76,9 +76,9 @@ static int ipv4_local_port_range(ctl_table *table, int write,
 }
 
 
-static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
+static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
 {
-	gid_t *data = table->data;
+	kgid_t *data = table->data;
 	unsigned int seq;
 	do {
 		seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -89,12 +89,12 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low,
 }
 
 /* Update system visible IP port range */
-static void set_ping_group_range(struct ctl_table *table, gid_t range[2])
+static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
 {
-	gid_t *data = table->data;
+	kgid_t *data = table->data;
 	write_seqlock(&sysctl_local_ports.lock);
-	data[0] = range[0];
-	data[1] = range[1];
+	data[0] = low;
+	data[1] = high;
 	write_sequnlock(&sysctl_local_ports.lock);
 }
 
@@ -103,21 +103,33 @@ static int ipv4_ping_group_range(ctl_table *table, int write,
 				 void __user *buffer,
 				 size_t *lenp, loff_t *ppos)
 {
+	struct user_namespace *user_ns = current_user_ns();
 	int ret;
-	gid_t range[2];
+	gid_t urange[2];
+	kgid_t low, high;
 	ctl_table tmp = {
-		.data = &range,
-		.maxlen = sizeof(range),
+		.data = &urange,
+		.maxlen = sizeof(urange),
 		.mode = table->mode,
 		.extra1 = &ip_ping_group_range_min,
 		.extra2 = &ip_ping_group_range_max,
 	};
 
-	inet_get_ping_group_range_table(table, range, range + 1);
+	inet_get_ping_group_range_table(table, &low, &high);
+	urange[0] = from_kgid_munged(user_ns, low);
+	urange[1] = from_kgid_munged(user_ns, high);
 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
-	if (write && ret == 0)
-		set_ping_group_range(table, range);
+	if (write && ret == 0) {
+		low = make_kgid(user_ns, urange[0]);
+		high = make_kgid(user_ns, urange[1]);
+		if (!gid_valid(low) || !gid_valid(high) ||
+		    (urange[1] < urange[0]) || gid_lt(high, low)) {
+			low = make_kgid(&init_user_ns, 1);
+			high = make_kgid(&init_user_ns, 0);
+		}
+		set_ping_group_range(table, low, high);
+	}
 
 	return ret;
 }
@@ -220,6 +232,45 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
 	return 0;
 }
 
+int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
+			  size_t *lenp, loff_t *ppos)
+{
+	ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
+	struct tcp_fastopen_context *ctxt;
+	int ret;
+	u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
+	if (!tbl.data)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	ctxt = rcu_dereference(tcp_fastopen_ctx);
+	if (ctxt)
+		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+	rcu_read_unlock();
+
+	snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
+		 user_key[0], user_key[1], user_key[2], user_key[3]);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+	if (write && ret == 0) {
+		if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
+			   user_key + 2, user_key + 3) != 4) {
+			ret = -EINVAL;
+			goto bad_key;
+		}
+		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+	}
+
+bad_key:
+	pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+		 user_key[0], user_key[1], user_key[2], user_key[3],
+		 (char *)tbl.data, ret);
+	kfree(tbl.data);
+	return ret;
+}
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname = "tcp_timestamps",
@@ -374,6 +425,12 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler = proc_dointvec,
 	},
 	{
+		.procname = "tcp_fastopen_key",
+		.mode = 0600,
+		.maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+		.proc_handler = proc_tcp_fastopen_key,
+	},
+	{
 		.procname = "tcp_tw_recycle",
 		.data = &tcp_death_row.sysctl_tw_recycle,
 		.maxlen = sizeof(int),
@@ -786,7 +843,7 @@ static struct ctl_table ipv4_net_table[] = {
 	{
 		.procname = "ping_group_range",
 		.data = &init_net.ipv4.sysctl_ping_group_range,
-		.maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
+		.maxlen = sizeof(gid_t)*2,
 		.mode = 0644,
 		.proc_handler = ipv4_ping_group_range,
 	},
@@ -830,8 +887,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 	 * Sane defaults - nobody may create ping sockets.
 	 * Boot scripts should set this to distro-specific group.
 	 */
-	net->ipv4.sysctl_ping_group_range[0] = 1;
-	net->ipv4.sysctl_ping_group_range[1] = 0;
+	net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1);
+	net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0);
 
 	tcp_init_mem(net);
 
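proc_tcp_fastopen_key() above exchanges the key with userspace as four dash-separated 32-bit hex words, registered under the ipv4 sysctl table. A sketch of reading and replacing it from userspace, assuming a kernel with this patch (the proc path follows from the "tcp_fastopen_key" ctl_table entry):

#include <stdio.h>
#include <stdint.h>

static const char *tfo_key_path = "/proc/sys/net/ipv4/tcp_fastopen_key";

int read_tfo_key(uint32_t key[4])
{
	FILE *f = fopen(tfo_key_path, "r");
	int n;

	if (!f)
		return -1;
	/* Matches the "%08x-%08x-%08x-%08x" format the handler writes. */
	n = fscanf(f, "%x-%x-%x-%x", &key[0], &key[1], &key[2], &key[3]);
	fclose(f);
	return n == 4 ? 0 : -1;
}

int write_tfo_key(const uint32_t key[4])
{
	FILE *f = fopen(tfo_key_path, "w");

	if (!f)
		return -1;
	fprintf(f, "%08x-%08x-%08x-%08x", key[0], key[1], key[2], key[3]);
	return fclose(f);
}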
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2109ff4a1daf..df83d744e380 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 
-	/* Connected? */
-	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+	/* Connected or passive Fast Open socket? */
+	if (sk->sk_state != TCP_SYN_SENT &&
+	    (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
 		int target = sock_rcvlowat(sk, 0, INT_MAX);
 
 		if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto out_err;
+	}
 
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-	/* Wait for a connection to finish. */
-	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+	/* Wait for a connection to finish. One exception is TCP Fast Open
+	 * (passive side) where data is allowed to be sent before a connection
+	 * is fully established.
+	 */
+	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+	    !tcp_passive_fastopen(sk)) {
 		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 			goto do_error;
+	}
 
 	if (unlikely(tp->repair)) {
 		if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout)
 		 * they look as CLOSING or LAST_ACK for Linux)
 		 * Probably, I missed some more holelets.
 		 * 						--ANK
+		 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+		 * in a single packet! (May consider it later but will
+		 * probably need API support or TCP_CORK SYN-ACK until
+		 * data is written and socket is closed.)
 		 */
 		tcp_send_fin(sk);
 	}
@@ -2215,8 +2230,16 @@ adjudge_to_death:
 		}
 	}
 
-	if (sk->sk_state == TCP_CLOSE)
+	if (sk->sk_state == TCP_CLOSE) {
+		struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+		/* We could get here with a non-NULL req if the socket is
+		 * aborted (e.g., closed with unread data) before 3WHS
+		 * finishes.
+		 */
+		if (req != NULL)
+			reqsk_fastopen_remove(sk, req, false);
 		inet_csk_destroy_sock(sk);
+	}
 	/* Otherwise, socket is reprieved until protocol close. */
 
 out:
@@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			icsk->icsk_user_timeout = msecs_to_jiffies(val);
 		break;
+
+	case TCP_FASTOPEN:
+		if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+		    TCPF_LISTEN)))
+			err = fastopen_init_queue(sk, val);
+		else
+			err = -EINVAL;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
 
 void tcp_done(struct sock *sk)
 {
+	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
 	tcp_set_state(sk, TCP_CLOSE);
 	tcp_clear_xmit_timers(sk);
+	if (req != NULL)
+		reqsk_fastopen_remove(sk, req, false);
 
 	sk->sk_shutdown = SHUTDOWN_MASK;
 
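The tcp_sendmsg()/do_tcp_sendpages() relaxation above is what lets a passive Fast Open child carry data while still in TCP_SYN_RECV. On the active side, which was merged before this series, the client sends data in the SYN with sendto() and MSG_FASTOPEN; a hedged sketch (the fallback #define mirrors the flag value from the client-side series):

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN	0x20000000	/* from the client-side TFO series */
#endif

/* Sends 'len' bytes in the SYN when a valid cookie is cached, or falls
 * back to a regular handshake followed by the data otherwise. */
ssize_t tfo_connect_send(int fd, const struct sockaddr_in *dst,
			 const void *buf, size_t len)
{
	return sendto(fd, buf, len, MSG_FASTOPEN,
		      (const struct sockaddr *)dst, sizeof(*dst));
}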
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a7f729c409d7..8f7ef0ad80e5 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,10 +1,91 @@
+#include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/tcp.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <net/inetpeer.h>
+#include <net/tcp.h>
 
-int sysctl_tcp_fastopen;
+int sysctl_tcp_fastopen __read_mostly;
+
+struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
+
+static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
+
+static void tcp_fastopen_ctx_free(struct rcu_head *head)
+{
+	struct tcp_fastopen_context *ctx =
+	    container_of(head, struct tcp_fastopen_context, rcu);
+	crypto_free_cipher(ctx->tfm);
+	kfree(ctx);
+}
+
+int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+{
+	int err;
+	struct tcp_fastopen_context *ctx, *octx;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
+
+	if (IS_ERR(ctx->tfm)) {
+		err = PTR_ERR(ctx->tfm);
+error:		kfree(ctx);
+		pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
+		return err;
+	}
+	err = crypto_cipher_setkey(ctx->tfm, key, len);
+	if (err) {
+		pr_err("TCP: TFO cipher key error: %d\n", err);
+		crypto_free_cipher(ctx->tfm);
+		goto error;
+	}
+	memcpy(ctx->key, key, len);
+
+	spin_lock(&tcp_fastopen_ctx_lock);
+
+	octx = rcu_dereference_protected(tcp_fastopen_ctx,
+			lockdep_is_held(&tcp_fastopen_ctx_lock));
+	rcu_assign_pointer(tcp_fastopen_ctx, ctx);
+	spin_unlock(&tcp_fastopen_ctx_lock);
+
+	if (octx)
+		call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+	return err;
+}
+
+/* Computes the fastopen cookie for the peer.
+ * The peer address is a 128 bits long (pad with zeros for IPv4).
+ *
+ * The caller must check foc->len to determine if a valid cookie
+ * has been generated successfully.
+*/
+void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
+{
+	__be32 peer_addr[4] = { addr, 0, 0, 0 };
+	struct tcp_fastopen_context *ctx;
+
+	rcu_read_lock();
+	ctx = rcu_dereference(tcp_fastopen_ctx);
+	if (ctx) {
+		crypto_cipher_encrypt_one(ctx->tfm,
+					  foc->val,
+					  (__u8 *)peer_addr);
+		foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+	}
+	rcu_read_unlock();
+}
 
 static int __init tcp_fastopen_init(void)
 {
+	__u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+	get_random_bytes(key, sizeof(key));
+	tcp_fastopen_reset_cipher(key, sizeof(key));
 	return 0;
 }
 
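tcp_fastopen_cookie_gen() above is one AES block encryption of the zero-padded peer address under the server key. A userspace sketch of the same derivation, with OpenSSL standing in for the kernel crypto API; note the kernel advertises only foc->len = TCP_FASTOPEN_COOKIE_SIZE bytes of the block as the cookie:

#include <openssl/aes.h>
#include <stdint.h>
#include <string.h>

/* Mirrors tcp_fastopen_cookie_gen(): one AES-128 block over the peer
 * address padded to 128 bits (IPv4 case). */
void tfo_cookie_block(const uint8_t key[16], uint32_t peer_addr_be,
		      uint8_t out[16])
{
	uint8_t block[16] = { 0 };
	AES_KEY aes;

	memcpy(block, &peer_addr_be, sizeof(peer_addr_be));
	AES_set_encrypt_key(key, 128, &aes);
	AES_encrypt(block, out, &aes);
}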
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bcfccc5cb8d0..8c304a400798 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -378,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 /* 4. Try to fixup all. It is made immediately after connection enters
  *    established state.
  */
-static void tcp_init_buffer_space(struct sock *sk)
+void tcp_init_buffer_space(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int maxwin;
@@ -2930,13 +2930,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 * tcp_xmit_retransmit_queue().
 */
 static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
-				  int newly_acked_sacked, bool is_dupack,
+				  int prior_sacked, bool is_dupack,
 				  int flag)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
 				    (tcp_fackets_out(tp) > tp->reordering));
+	int newly_acked_sacked = 0;
 	int fast_rexmit = 0;
 
 	if (WARN_ON(!tp->packets_out && tp->sacked_out))
@@ -2996,6 +2997,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 				tcp_add_reno_sack(sk);
 		} else
 			do_lost = tcp_try_undo_partial(sk, pkts_acked);
+		newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
 		break;
 	case TCP_CA_Loss:
 		if (flag & FLAG_DATA_ACKED)
@@ -3017,6 +3019,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 			if (is_dupack)
 				tcp_add_reno_sack(sk);
 		}
+		newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
 
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
@@ -3124,6 +3127,12 @@ void tcp_rearm_rto(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	/* If the retrans timer is currently being used by Fast Open
+	 * for SYN-ACK retrans purpose, stay put.
+	 */
+	if (tp->fastopen_rsk)
+		return;
+
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
@@ -3594,7 +3603,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	int prior_packets;
 	int prior_sacked = tp->sacked_out;
 	int pkts_acked = 0;
-	int newly_acked_sacked = 0;
 	bool frto_cwnd = false;
 
 	/* If the ack is older than previous acks
@@ -3670,8 +3678,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
 
 	pkts_acked = prior_packets - tp->packets_out;
-	newly_acked_sacked = (prior_packets - prior_sacked) -
-			     (tp->packets_out - tp->sacked_out);
 
 	if (tp->frto_counter)
 		frto_cwnd = tcp_process_frto(sk, flag);
@@ -3685,7 +3691,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	    tcp_may_raise_cwnd(sk, flag))
 		tcp_cong_avoid(sk, ack, prior_in_flight);
 	is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
-	tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+	tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
 			      is_dupack, flag);
 	} else {
 		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
@@ -3702,7 +3708,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK)
-		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+		tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
 				      is_dupack, flag);
 	/* If this ack opens up a zero window, clear backoff. It was
 	 * being used to time the probes, and is probably far higher than
@@ -3722,8 +3728,7 @@ old_ack:
 	 */
 	if (TCP_SKB_CB(skb)->sacked) {
 		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
-		newly_acked_sacked = tp->sacked_out - prior_sacked;
-		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
+		tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
 				      is_dupack, flag);
 	}
 
@@ -4039,7 +4044,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
 }
 
 /* When we get a reset we do this. */
-static void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk)
 {
 	/* We want the right error as BSD sees it (and indeed as we do). */
 	switch (sk->sk_state) {
@@ -5896,7 +5901,9 @@ discard:
 		tcp_send_synack(sk);
 #if 0
 		/* Note, we could accept data and URG from this segment.
-		 * There are no obstacles to make this.
+		 * There are no obstacles to make this (except that we must
+		 * either change tcp_recvmsg() to prevent it from returning data
+		 * before 3WHS completes per RFC793, or employ TCP Fast Open).
 		 *
 		 * However, if we ignore data in ACKless segments sometimes,
 		 * we have no reasons to accept it sometimes.
@@ -5936,6 +5943,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct request_sock *req;
 	int queued = 0;
 
 	tp->rx_opt.saw_tstamp = 0;
@@ -5991,7 +5999,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		return 0;
 	}
 
-	if (!tcp_validate_incoming(sk, skb, th, 0))
+	req = tp->fastopen_rsk;
+	if (req != NULL) {
+		BUG_ON(sk->sk_state != TCP_SYN_RECV &&
+		       sk->sk_state != TCP_FIN_WAIT1);
+
+		if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
+			goto discard;
+	} else if (!tcp_validate_incoming(sk, skb, th, 0))
 		return 0;
 
 	/* step 5: check the ACK field */
@@ -6001,7 +6016,22 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 	switch (sk->sk_state) {
 	case TCP_SYN_RECV:
 		if (acceptable) {
-			tp->copied_seq = tp->rcv_nxt;
+			/* Once we leave TCP_SYN_RECV, we no longer
+			 * need req so release it.
+			 */
+			if (req) {
+				reqsk_fastopen_remove(sk, req, false);
+			} else {
+				/* Make sure socket is routed, for
+				 * correct metrics.
+				 */
+				icsk->icsk_af_ops->rebuild_header(sk);
+				tcp_init_congestion_control(sk);
+
+				tcp_mtup_init(sk);
+				tcp_init_buffer_space(sk);
+				tp->copied_seq = tp->rcv_nxt;
+			}
 			smp_mb();
 			tcp_set_state(sk, TCP_ESTABLISHED);
 			sk->sk_state_change(sk);
@@ -6023,23 +6053,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if (tp->rx_opt.tstamp_ok)
 				tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
 
-			/* Make sure socket is routed, for
-			 * correct metrics.
-			 */
-			icsk->icsk_af_ops->rebuild_header(sk);
-
-			tcp_init_metrics(sk);
-
-			tcp_init_congestion_control(sk);
+			if (req) {
+				/* Re-arm the timer because data may
+				 * have been sent out. This is similar
+				 * to the regular data transmission case
+				 * when new data has just been ack'ed.
+				 *
+				 * (TFO) - we could try to be more
+				 * aggressive and retranmitting any data
+				 * sooner based on when they were sent
+				 * out.
+				 */
+				tcp_rearm_rto(sk);
+			} else
+				tcp_init_metrics(sk);
 
 			/* Prevent spurious tcp_cwnd_restart() on
 			 * first data packet.
 			 */
 			tp->lsndtime = tcp_time_stamp;
 
-			tcp_mtup_init(sk);
 			tcp_initialize_rcv_mss(sk);
-			tcp_init_buffer_space(sk);
 			tcp_fast_path_on(tp);
 		} else {
 			return 1;
@@ -6047,6 +6081,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		break;
 
 	case TCP_FIN_WAIT1:
+		/* If we enter the TCP_FIN_WAIT1 state and we are a
+		 * Fast Open socket and this is the first acceptable
+		 * ACK we have received, this would have acknowledged
+		 * our SYNACK so stop the SYNACK timer.
+		 */
+		if (acceptable && req != NULL) {
+			/* We no longer need the request sock. */
+			reqsk_fastopen_remove(sk, req, false);
+			tcp_rearm_rto(sk);
+		}
 		if (tp->snd_una == tp->write_seq) {
 			struct dst_entry *dst;
 
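Aside from the Fast Open plumbing, the tcp_input.c hunks move the newly-acked-or-SACKed accounting from tcp_ack() into tcp_fastretrans_alert(), computing it only once tp->sacked_out is final for this ACK. Isolated, the quantity is just a delta against pre-ACK snapshots (helper name is illustrative, not from the patch):

/* Packets newly cumulatively ACKed plus the change in SACKed packets,
 * computed from snapshots taken before this ACK was processed. */
static inline int newly_acked_sacked(int pkts_acked,
				     int sacked_out_now, int prior_sacked)
{
	return pkts_acked + sacked_out_now - prior_sacked;
}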
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1e15c5be04e7..e64abed249cc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
352 const int code = icmp_hdr(icmp_skb)->code; 352 const int code = icmp_hdr(icmp_skb)->code;
353 struct sock *sk; 353 struct sock *sk;
354 struct sk_buff *skb; 354 struct sk_buff *skb;
355 struct request_sock *req;
355 __u32 seq; 356 __u32 seq;
356 __u32 remaining; 357 __u32 remaining;
357 int err; 358 int err;
@@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
394 395
395 icsk = inet_csk(sk); 396 icsk = inet_csk(sk);
396 tp = tcp_sk(sk); 397 tp = tcp_sk(sk);
398 req = tp->fastopen_rsk;
397 seq = ntohl(th->seq); 399 seq = ntohl(th->seq);
398 if (sk->sk_state != TCP_LISTEN && 400 if (sk->sk_state != TCP_LISTEN &&
399 !between(seq, tp->snd_una, tp->snd_nxt)) { 401 !between(seq, tp->snd_una, tp->snd_nxt) &&
402 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
403 /* For a Fast Open socket, allow seq to be snt_isn. */
400 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); 404 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
401 goto out; 405 goto out;
402 } 406 }
@@ -435,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
435 !icsk->icsk_backoff) 439 !icsk->icsk_backoff)
436 break; 440 break;
437 441
442 /* XXX (TFO) - revisit the following logic for TFO */
443
438 if (sock_owned_by_user(sk)) 444 if (sock_owned_by_user(sk))
439 break; 445 break;
440 446
@@ -466,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
466 goto out; 472 goto out;
467 } 473 }
468 474
475 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
476 * than following the TCP_SYN_RECV case and closing the socket,
477 * we ignore the ICMP error and keep trying like a fully established
478 * socket. Is this the right thing to do?
479 */
480 if (req && req->sk == NULL)
481 goto out;
482
469 switch (sk->sk_state) { 483 switch (sk->sk_state) {
470 struct request_sock *req, **prev; 484 struct request_sock *req, **prev;
471 case TCP_LISTEN: 485 case TCP_LISTEN:
@@ -498,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
498 512
499 case TCP_SYN_SENT: 513 case TCP_SYN_SENT:
500 case TCP_SYN_RECV: /* Cannot happen. 514 case TCP_SYN_RECV: /* Cannot happen.
501 It can f.e. if SYNs crossed. 515 It can f.e. if SYNs crossed,
516 or Fast Open.
502 */ 517 */
503 if (!sock_owned_by_user(sk)) { 518 if (!sock_owned_by_user(sk)) {
504 sk->sk_err = err; 519 sk->sk_err = err;
@@ -809,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
809static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, 824static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
810 struct request_sock *req) 825 struct request_sock *req)
811{ 826{
812 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, 827 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
813 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, 828 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
829 */
830 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
831 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
832 tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
814 req->ts_recent, 833 req->ts_recent,
815 0, 834 0,
816 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, 835 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -839,7 +858,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
839 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 858 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
840 return -1; 859 return -1;
841 860
842 skb = tcp_make_synack(sk, dst, req, rvp); 861 skb = tcp_make_synack(sk, dst, req, rvp, NULL);
843 862
844 if (skb) { 863 if (skb) {
845 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); 864 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
@@ -1272,6 +1291,178 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1272}; 1291};
1273#endif 1292#endif
1274 1293
1294static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1295 struct request_sock *req,
1296 struct tcp_fastopen_cookie *foc,
1297 struct tcp_fastopen_cookie *valid_foc)
1298{
1299 bool skip_cookie = false;
1300 struct fastopen_queue *fastopenq;
1301
1302 if (likely(!fastopen_cookie_present(foc))) {
1303 /* See include/net/tcp.h for the meaning of these knobs */
1304 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1305 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1306 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1307 skip_cookie = true; /* no cookie to validate */
1308 else
1309 return false;
1310 }
1311 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1312 /* A FO option is present; bump the counter. */
1313 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1314
1315 /* Make sure the listener has enabled fastopen, and we don't
1316 * exceed the max # of pending TFO requests allowed before trying
1317 * to validating the cookie in order to avoid burning CPU cycles
1318 * unnecessarily.
1319 *
1320 * XXX (TFO) - The implication of checking the max_qlen before
1321 * processing a cookie request is that clients can't differentiate
1322 * between qlen overflow causing Fast Open to be disabled
1323 * temporarily vs a server not supporting Fast Open at all.
1324 */
1325 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1326 fastopenq == NULL || fastopenq->max_qlen == 0)
1327 return false;
1328
1329 if (fastopenq->qlen >= fastopenq->max_qlen) {
1330 struct request_sock *req1;
1331 spin_lock(&fastopenq->lock);
1332 req1 = fastopenq->rskq_rst_head;
1333 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1334 spin_unlock(&fastopenq->lock);
1335 NET_INC_STATS_BH(sock_net(sk),
1336 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1337 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1338 foc->len = -1;
1339 return false;
1340 }
1341 fastopenq->rskq_rst_head = req1->dl_next;
1342 fastopenq->qlen--;
1343 spin_unlock(&fastopenq->lock);
1344 reqsk_free(req1);
1345 }
1346 if (skip_cookie) {
1347 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1348 return true;
1349 }
1350 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1351 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1352 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1353 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1354 memcmp(&foc->val[0], &valid_foc->val[0],
1355 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1356 return false;
1357 valid_foc->len = -1;
1358 }
1359 /* Acknowledge the data received from the peer. */
1360 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1361 return true;
1362 } else if (foc->len == 0) { /* Client requesting a cookie */
1363 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1364 NET_INC_STATS_BH(sock_net(sk),
1365 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1366 } else {
1367		/* Client sent a cookie with the wrong size. Treat it
1368		 * the same as an invalid cookie and return a valid one.
1369 */
1370 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1371 }
1372 return false;
1373}
1374
1375static int tcp_v4_conn_req_fastopen(struct sock *sk,
1376 struct sk_buff *skb,
1377 struct sk_buff *skb_synack,
1378 struct request_sock *req,
1379 struct request_values *rvp)
1380{
1381 struct tcp_sock *tp = tcp_sk(sk);
1382 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1383 const struct inet_request_sock *ireq = inet_rsk(req);
1384 struct sock *child;
1385
1386 req->retrans = 0;
1387 req->sk = NULL;
1388
1389 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1390 if (child == NULL) {
1391 NET_INC_STATS_BH(sock_net(sk),
1392 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1393 kfree_skb(skb_synack);
1394 return -1;
1395 }
1396 ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1397 ireq->rmt_addr, ireq->opt);
1398 /* XXX (TFO) - is it ok to ignore error and continue? */
1399
1400 spin_lock(&queue->fastopenq->lock);
1401 queue->fastopenq->qlen++;
1402 spin_unlock(&queue->fastopenq->lock);
1403
1404	/* Initialize the child socket. Some values have to be fixed up
1405	 * to account for the child being a Fast Open socket created
1406	 * only from the bits carried in the SYN packet.
1407 */
1408 tp = tcp_sk(child);
1409
1410 tp->fastopen_rsk = req;
1411	/* Do a hold on the listener sk so that if the listener is being
1412 * closed, the child that has been accepted can live on and still
1413 * access listen_lock.
1414 */
1415 sock_hold(sk);
1416 tcp_rsk(req)->listener = sk;
1417
1418 /* RFC1323: The window in SYN & SYN/ACK segments is never
1419 * scaled. So correct it appropriately.
1420 */
1421 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1422
1423	/* Activate the retrans timer so that the SYNACK can be retransmitted.
1424 * The request socket is not added to the SYN table of the parent
1425 * because it's been added to the accept queue directly.
1426 */
1427 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1428 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1429
1430 /* Add the child socket directly into the accept queue */
1431 inet_csk_reqsk_queue_add(sk, req, child);
1432
1433 /* Now finish processing the fastopen child socket. */
1434 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1435 tcp_init_congestion_control(child);
1436 tcp_mtup_init(child);
1437 tcp_init_buffer_space(child);
1438 tcp_init_metrics(child);
1439
1440 /* Queue the data carried in the SYN packet. We need to first
1441 * bump skb's refcnt because the caller will attempt to free it.
1442 *
1443 * XXX (TFO) - we honor a zero-payload TFO request for now.
1444 * (Any reason not to?)
1445 */
1446 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1447		/* Don't queue the skb if there is no payload in the SYN.
1448 * XXX (TFO) - How about SYN+FIN?
1449 */
1450 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1451 } else {
1452 skb = skb_get(skb);
1453 skb_dst_drop(skb);
1454 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1455 skb_set_owner_r(skb, child);
1456 __skb_queue_tail(&child->sk_receive_queue, skb);
1457 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1458 }
1459 sk->sk_data_ready(sk, 0);
1460 bh_unlock_sock(child);
1461 sock_put(child);
1462 WARN_ON(req->sk == NULL);
1463 return 0;
1464}
1465
1275int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1466int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1276{ 1467{
1277 struct tcp_extend_values tmp_ext; 1468 struct tcp_extend_values tmp_ext;
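
For context: tcp_fastopen_check() above bails out unless the server bit is set in sysctl_tcp_fastopen and the listener carries a fastopenq with a non-zero max_qlen. A minimal userspace sketch of how a listener ends up in that state, assuming a kernel with this series applied and a libc that exposes the TCP_FASTOPEN socket option (whose value becomes max_qlen); port 8000 and qlen 16 are arbitrary:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int qlen = 16;	/* becomes fastopenq->max_qlen for this listener */
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8000);

	if (fd < 0 ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) < 0 ||
	    listen(fd, 128) < 0) {
		perror("tfo listener");
		return 1;
	}
	/* accept() now also hands out children created from data-bearing SYNs */
	close(accept(fd, NULL, NULL));
	close(fd);
	return 0;
}

net.ipv4.tcp_fastopen must additionally have TFO_SERVER_ENABLE set for the check above to pass.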
@@ -1285,6 +1476,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1285 __be32 daddr = ip_hdr(skb)->daddr; 1476 __be32 daddr = ip_hdr(skb)->daddr;
1286 __u32 isn = TCP_SKB_CB(skb)->when; 1477 __u32 isn = TCP_SKB_CB(skb)->when;
1287 bool want_cookie = false; 1478 bool want_cookie = false;
1479 struct flowi4 fl4;
1480 struct tcp_fastopen_cookie foc = { .len = -1 };
1481 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1482 struct sk_buff *skb_synack;
1483 int do_fastopen;
1288 1484
1289 /* Never answer to SYNs send to broadcast or multicast */ 1485 /* Never answer to SYNs send to broadcast or multicast */
1290 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1486 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1319,7 +1515,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1319 tcp_clear_options(&tmp_opt); 1515 tcp_clear_options(&tmp_opt);
1320 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1516 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1321 tmp_opt.user_mss = tp->rx_opt.user_mss; 1517 tmp_opt.user_mss = tp->rx_opt.user_mss;
1322 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 1518 tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1519 want_cookie ? NULL : &foc);
1323 1520
1324 if (tmp_opt.cookie_plus > 0 && 1521 if (tmp_opt.cookie_plus > 0 &&
1325 tmp_opt.saw_tstamp && 1522 tmp_opt.saw_tstamp &&
@@ -1377,8 +1574,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1377 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1574 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1378 req->cookie_ts = tmp_opt.tstamp_ok; 1575 req->cookie_ts = tmp_opt.tstamp_ok;
1379 } else if (!isn) { 1576 } else if (!isn) {
1380 struct flowi4 fl4;
1381
1382 /* VJ's idea. We save last timestamp seen 1577 /* VJ's idea. We save last timestamp seen
1383 * from the destination in peer table, when entering 1578 * from the destination in peer table, when entering
1384 * state TIME-WAIT, and check against it before 1579 * state TIME-WAIT, and check against it before
@@ -1419,14 +1614,52 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1419 tcp_rsk(req)->snt_isn = isn; 1614 tcp_rsk(req)->snt_isn = isn;
1420 tcp_rsk(req)->snt_synack = tcp_time_stamp; 1615 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1421 1616
1422 if (tcp_v4_send_synack(sk, dst, req, 1617 if (dst == NULL) {
1423 (struct request_values *)&tmp_ext, 1618 dst = inet_csk_route_req(sk, &fl4, req);
1424 skb_get_queue_mapping(skb), 1619 if (dst == NULL)
1425 want_cookie) || 1620 goto drop_and_free;
1426 want_cookie) 1621 }
1622 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1623
1624 /* We don't call tcp_v4_send_synack() directly because we need
1625 * to make sure a child socket can be created successfully before
1626	 * sending back the synack!
1627 *
1628 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1629 * (or better yet, call tcp_send_synack() in the child context
1630	 * directly, but that requires fixing a bunch of other code first)
1631	 * after syn_recv_sock(), except one will need to first fix the
1632 * latter to remove its dependency on the current implementation
1633 * of tcp_v4_send_synack()->tcp_select_initial_window().
1634 */
1635 skb_synack = tcp_make_synack(sk, dst, req,
1636 (struct request_values *)&tmp_ext,
1637 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1638
1639 if (skb_synack) {
1640 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1641 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1642 } else
1643 goto drop_and_free;
1644
1645 if (likely(!do_fastopen)) {
1646 int err;
1647 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1648 ireq->rmt_addr, ireq->opt);
1649 err = net_xmit_eval(err);
1650 if (err || want_cookie)
1651 goto drop_and_free;
1652
1653 tcp_rsk(req)->listener = NULL;
1654 /* Add the request_sock to the SYN table */
1655 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1656 if (fastopen_cookie_present(&foc) && foc.len != 0)
1657 NET_INC_STATS_BH(sock_net(sk),
1658 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1659 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1660 (struct request_values *)&tmp_ext))
1427 goto drop_and_free; 1661 goto drop_and_free;
1428 1662
1429 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1430 return 0; 1663 return 0;
1431 1664
1432drop_and_release: 1665drop_and_release:
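
The request path above distinguishes three client behaviors: no cookie (optionally bypassed via the sysctl knobs), a valid TCP_FASTOPEN_COOKIE_SIZE cookie, and a cookie request (foc.len == 0). A hypothetical client-side sketch of the other end of that exchange, assuming a kernel new enough to accept MSG_FASTOPEN (0x20000000): the first call only requests and caches a cookie, later calls against the same destination may carry the payload in the SYN.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif

int main(void)
{
	const char req[] = "GET / HTTP/1.0\r\n\r\n";
	struct sockaddr_in srv;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	memset(&srv, 0, sizeof(srv));
	srv.sin_family = AF_INET;
	srv.sin_port = htons(8000);
	inet_pton(AF_INET, "127.0.0.1", &srv.sin_addr);

	/* connect() + write() folded into one call; data may ride the SYN */
	if (sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
		   (struct sockaddr *)&srv, sizeof(srv)) < 0) {
		perror("sendto(MSG_FASTOPEN)");
		return 1;
	}
	close(fd);
	return 0;
}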
@@ -1554,7 +1787,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1554 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, 1787 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1555 iph->saddr, iph->daddr); 1788 iph->saddr, iph->daddr);
1556 if (req) 1789 if (req)
1557 return tcp_check_req(sk, skb, req, prev); 1790 return tcp_check_req(sk, skb, req, prev, false);
1558 1791
1559 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, 1792 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1560 th->source, iph->daddr, th->dest, inet_iif(skb)); 1793 th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1977,6 +2210,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
1977 tcp_cookie_values_release); 2210 tcp_cookie_values_release);
1978 tp->cookie_values = NULL; 2211 tp->cookie_values = NULL;
1979 } 2212 }
2213 BUG_ON(tp->fastopen_rsk != NULL);
1980 2214
1981 /* If socket is aborted during connect operation */ 2215 /* If socket is aborted during connect operation */
1982 tcp_free_fastopen_req(tp); 2216 tcp_free_fastopen_req(tp);
@@ -2393,7 +2627,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2393EXPORT_SYMBOL(tcp_proc_unregister); 2627EXPORT_SYMBOL(tcp_proc_unregister);
2394 2628
2395static void get_openreq4(const struct sock *sk, const struct request_sock *req, 2629static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2396 struct seq_file *f, int i, int uid, int *len) 2630 struct seq_file *f, int i, kuid_t uid, int *len)
2397{ 2631{
2398 const struct inet_request_sock *ireq = inet_rsk(req); 2632 const struct inet_request_sock *ireq = inet_rsk(req);
2399 long delta = req->expires - jiffies; 2633 long delta = req->expires - jiffies;
@@ -2410,7 +2644,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2410 1, /* timers active (only the expire timer) */ 2644 1, /* timers active (only the expire timer) */
2411 jiffies_delta_to_clock_t(delta), 2645 jiffies_delta_to_clock_t(delta),
2412 req->retrans, 2646 req->retrans,
2413 uid, 2647 from_kuid_munged(seq_user_ns(f), uid),
2414 0, /* non standard timer */ 2648 0, /* non standard timer */
2415 0, /* open_requests have no inode */ 2649 0, /* open_requests have no inode */
2416 atomic_read(&sk->sk_refcnt), 2650 atomic_read(&sk->sk_refcnt),
@@ -2425,6 +2659,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2425 const struct tcp_sock *tp = tcp_sk(sk); 2659 const struct tcp_sock *tp = tcp_sk(sk);
2426 const struct inet_connection_sock *icsk = inet_csk(sk); 2660 const struct inet_connection_sock *icsk = inet_csk(sk);
2427 const struct inet_sock *inet = inet_sk(sk); 2661 const struct inet_sock *inet = inet_sk(sk);
2662 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2428 __be32 dest = inet->inet_daddr; 2663 __be32 dest = inet->inet_daddr;
2429 __be32 src = inet->inet_rcv_saddr; 2664 __be32 src = inet->inet_rcv_saddr;
2430 __u16 destp = ntohs(inet->inet_dport); 2665 __u16 destp = ntohs(inet->inet_dport);
@@ -2461,7 +2696,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2461 timer_active, 2696 timer_active,
2462 jiffies_delta_to_clock_t(timer_expires - jiffies), 2697 jiffies_delta_to_clock_t(timer_expires - jiffies),
2463 icsk->icsk_retransmits, 2698 icsk->icsk_retransmits,
2464 sock_i_uid(sk), 2699 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2465 icsk->icsk_probes_out, 2700 icsk->icsk_probes_out,
2466 sock_i_ino(sk), 2701 sock_i_ino(sk),
2467 atomic_read(&sk->sk_refcnt), sk, 2702 atomic_read(&sk->sk_refcnt), sk,
@@ -2469,7 +2704,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2469 jiffies_to_clock_t(icsk->icsk_ack.ato), 2704 jiffies_to_clock_t(icsk->icsk_ack.ato),
2470 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2705 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2471 tp->snd_cwnd, 2706 tp->snd_cwnd,
2472 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, 2707 sk->sk_state == TCP_LISTEN ?
2708 (fastopenq ? fastopenq->max_qlen : 0) :
2709 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2473 len); 2710 len);
2474} 2711}
2475 2712
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6ff7f10dce9d..e965319d610b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 507 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
508 newtp->rx_opt.mss_clamp = req->mss; 508 newtp->rx_opt.mss_clamp = req->mss;
509 TCP_ECN_openreq_child(newtp, req); 509 TCP_ECN_openreq_child(newtp, req);
510 newtp->fastopen_rsk = NULL;
510 511
511 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 512 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
512 } 513 }
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
515EXPORT_SYMBOL(tcp_create_openreq_child); 516EXPORT_SYMBOL(tcp_create_openreq_child);
516 517
517/* 518/*
518 * Process an incoming packet for SYN_RECV sockets represented 519 * Process an incoming packet for SYN_RECV sockets represented as a
519 * as a request_sock. 520 * request_sock. Normally sk is the listener socket but for TFO it
521 * points to the child socket.
522 *
523 * XXX (TFO) - The current impl contains a special check for ack
524 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
520 */ 525 */
521 526
522struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 527struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
523 struct request_sock *req, 528 struct request_sock *req,
524 struct request_sock **prev) 529 struct request_sock **prev,
530 bool fastopen)
525{ 531{
526 struct tcp_options_received tmp_opt; 532 struct tcp_options_received tmp_opt;
527 const u8 *hash_location; 533 const u8 *hash_location;
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
530 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 536 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
531 bool paws_reject = false; 537 bool paws_reject = false;
532 538
539 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
540
533 tmp_opt.saw_tstamp = 0; 541 tmp_opt.saw_tstamp = 0;
534 if (th->doff > (sizeof(struct tcphdr)>>2)) { 542 if (th->doff > (sizeof(struct tcphdr)>>2)) {
535 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 543 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL);
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
565 * 573 *
566 * Enforce "SYN-ACK" according to figure 8, figure 6 574 * Enforce "SYN-ACK" according to figure 8, figure 6
567 * of RFC793, fixed by RFC1122. 575 * of RFC793, fixed by RFC1122.
576 *
577 * Note that even if there is new data in the SYN packet
578	 * it will be thrown away too.
568 */ 579 */
569 req->rsk_ops->rtx_syn_ack(sk, req, NULL); 580 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
570 return NULL; 581 return NULL;
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
622 * sent (the segment carries an unacceptable ACK) ... 633 * sent (the segment carries an unacceptable ACK) ...
623 * a reset is sent." 634 * a reset is sent."
624 * 635 *
625 * Invalid ACK: reset will be sent by listening socket 636 * Invalid ACK: reset will be sent by listening socket.
637 * Note that the ACK validity check for a Fast Open socket is done
638 * elsewhere and is checked directly against the child socket rather
639 * than req because user data may have been sent out.
626 */ 640 */
627 if ((flg & TCP_FLAG_ACK) && 641 if ((flg & TCP_FLAG_ACK) && !fastopen &&
628 (TCP_SKB_CB(skb)->ack_seq != 642 (TCP_SKB_CB(skb)->ack_seq !=
629 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 643 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
630 return sk; 644 return sk;
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
637 /* RFC793: "first check sequence number". */ 651 /* RFC793: "first check sequence number". */
638 652
639 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 653 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
640 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { 654 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
641 /* Out of window: send ACK and drop. */ 655 /* Out of window: send ACK and drop. */
642 if (!(flg & TCP_FLAG_RST)) 656 if (!(flg & TCP_FLAG_RST))
643 req->rsk_ops->send_ack(sk, skb, req); 657 req->rsk_ops->send_ack(sk, skb, req);
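
The switch from rcv_isn + 1 to rcv_nxt matters once a SYN can carry data: the receive window must anchor at the sequence number the server actually expects next. For reference, a self-contained model of the wrap-safe comparison tcp_in_window() performs; the before()/after() helpers follow the kernel's signed-subtraction idiom, reproduced here on uint32_t:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;	/* works across 32-bit wraparound */
}
#define after(a, b) before(b, a)

/* Accept segments overlapping [s_win, e_win], mirroring tcp_in_window() */
static bool in_window(uint32_t seq, uint32_t end_seq,
		      uint32_t s_win, uint32_t e_win)
{
	if (seq == s_win)
		return true;
	if (after(end_seq, s_win) && before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

int main(void)
{
	/* After a TFO SYN with data, rcv_nxt already covers the payload,
	 * so the window anchors there rather than at isn + 1. */
	uint32_t rcv_nxt = 0xfffffff0u, wnd = 14600;	/* near wraparound */

	assert(in_window(rcv_nxt, rcv_nxt, rcv_nxt, rcv_nxt + wnd));
	assert(in_window(rcv_nxt + 5, rcv_nxt + 100, rcv_nxt, rcv_nxt + wnd));
	assert(!in_window(rcv_nxt - 2, rcv_nxt - 1, rcv_nxt, rcv_nxt + wnd));
	return 0;
}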
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
648 662
649 /* In sequence, PAWS is OK. */ 663 /* In sequence, PAWS is OK. */
650 664
651 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) 665 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
652 req->ts_recent = tmp_opt.rcv_tsval; 666 req->ts_recent = tmp_opt.rcv_tsval;
653 667
654 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 668 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
667 681
668 /* ACK sequence verified above, just make sure ACK is 682 /* ACK sequence verified above, just make sure ACK is
669 * set. If ACK not set, just silently drop the packet. 683 * set. If ACK not set, just silently drop the packet.
684 *
685 * XXX (TFO) - if we ever allow "data after SYN", the
686 * following check needs to be removed.
670 */ 687 */
671 if (!(flg & TCP_FLAG_ACK)) 688 if (!(flg & TCP_FLAG_ACK))
672 return NULL; 689 return NULL;
673 690
691 /* For Fast Open no more processing is needed (sk is the
692 * child socket).
693 */
694 if (fastopen)
695 return sk;
696
674 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 697 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
675 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 698 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
676 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 699 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
@@ -706,11 +729,21 @@ listen_overflow:
706 } 729 }
707 730
708embryonic_reset: 731embryonic_reset:
709 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); 732 if (!(flg & TCP_FLAG_RST)) {
710 	if (!(flg & TCP_FLAG_RST))		 733		/* Received a bad SYN pkt - for TFO we try not to reset
734		 * the local connection unless it's really necessary, to
735		 * avoid becoming vulnerable to an outside attack aimed at
736		 * resetting legitimate local connections.
737 */
711 req->rsk_ops->send_reset(sk, skb); 738 req->rsk_ops->send_reset(sk, skb);
712 739 } else if (fastopen) { /* received a valid RST pkt */
713 inet_csk_reqsk_queue_drop(sk, req, prev); 740 reqsk_fastopen_remove(sk, req, true);
741 tcp_reset(sk);
742 }
743 if (!fastopen) {
744 inet_csk_reqsk_queue_drop(sk, req, prev);
745 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
746 }
714 return NULL; 747 return NULL;
715} 748}
716EXPORT_SYMBOL(tcp_check_req); 749EXPORT_SYMBOL(tcp_check_req);
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req);
719 * Queue segment on the new socket if the new socket is active, 752 * Queue segment on the new socket if the new socket is active,
720 * otherwise we just shortcircuit this and continue with 753 * otherwise we just shortcircuit this and continue with
721 * the new socket. 754 * the new socket.
755 *
756 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
757 * when entering. But other states are possible due to a race condition
758 * where, after __inet_lookup_established() fails but before the
759 * listener lock is obtained, other packets cause the same
760 * connection to be created.
722 */ 761 */
723 762
724int tcp_child_process(struct sock *parent, struct sock *child, 763int tcp_child_process(struct sock *parent, struct sock *child,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d04632673a9e..9383b51f3efc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk,
702 unsigned int mss, struct sk_buff *skb, 702 unsigned int mss, struct sk_buff *skb,
703 struct tcp_out_options *opts, 703 struct tcp_out_options *opts,
704 struct tcp_md5sig_key **md5, 704 struct tcp_md5sig_key **md5,
705 struct tcp_extend_values *xvp) 705 struct tcp_extend_values *xvp,
706 struct tcp_fastopen_cookie *foc)
706{ 707{
707 struct inet_request_sock *ireq = inet_rsk(req); 708 struct inet_request_sock *ireq = inet_rsk(req);
708 unsigned int remaining = MAX_TCP_OPTION_SPACE; 709 unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk,
747 if (unlikely(!ireq->tstamp_ok)) 748 if (unlikely(!ireq->tstamp_ok))
748 remaining -= TCPOLEN_SACKPERM_ALIGNED; 749 remaining -= TCPOLEN_SACKPERM_ALIGNED;
749 } 750 }
750 751 if (foc != NULL) {
752 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
753 need = (need + 3) & ~3U; /* Align to 32 bits */
754 if (remaining >= need) {
755 opts->options |= OPTION_FAST_OPEN_COOKIE;
756 opts->fastopen_cookie = foc;
757 remaining -= need;
758 }
759 }
751 /* Similar rationale to tcp_syn_options() applies here, too. 760 /* Similar rationale to tcp_syn_options() applies here, too.
752 * If the <SYN> options fit, the same options should fit now! 761 * If the <SYN> options fit, the same options should fit now!
753 */ 762 */
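
The need = (need + 3) & ~3U step reserves option space in 32-bit units. Below is a hypothetical standalone encoder for the experimental Fast Open option as assumed here: kind 254, a 16-bit 0xF989 magic, then the cookie, NOP-padded up to the reserved size. The in-kernel option writer lays the bytes out differently; this only models the wire format and the space accounting mirrored from tcp_synack_options().

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TCPOPT_EXP			254	/* shared experimental kind */
#define TCPOPT_NOP			1
#define TCPOLEN_EXP_FASTOPEN_BASE	4	/* kind + len + 2-byte magic */
#define TCP_FASTOPEN_MAGIC		0xF989

static unsigned int write_fastopen_opt(uint8_t *p, const uint8_t *cookie,
				       uint8_t len)
{
	unsigned int need = (TCPOLEN_EXP_FASTOPEN_BASE + len + 3) & ~3U;
	unsigned int i;

	p[0] = TCPOPT_EXP;
	p[1] = TCPOLEN_EXP_FASTOPEN_BASE + len;	/* real option length */
	p[2] = TCP_FASTOPEN_MAGIC >> 8;
	p[3] = TCP_FASTOPEN_MAGIC & 0xff;
	memcpy(p + 4, cookie, len);
	for (i = TCPOLEN_EXP_FASTOPEN_BASE + len; i < need; i++)
		p[i] = TCPOPT_NOP;		/* pad to a 32-bit unit */
	return need;
}

int main(void)
{
	uint8_t buf[16], cookie[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	unsigned int n = write_fastopen_opt(buf, cookie, sizeof(cookie));

	printf("consumed %u bytes of option space\n", n);	/* 12 */
	return 0;
}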
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk)
2658 */ 2667 */
2659struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2668struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2660 struct request_sock *req, 2669 struct request_sock *req,
2661 struct request_values *rvp) 2670 struct request_values *rvp,
2671 struct tcp_fastopen_cookie *foc)
2662{ 2672{
2663 struct tcp_out_options opts; 2673 struct tcp_out_options opts;
2664 struct tcp_extend_values *xvp = tcp_xv(rvp); 2674 struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2718#endif 2728#endif
2719 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2729 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2720 tcp_header_size = tcp_synack_options(sk, req, mss, 2730 tcp_header_size = tcp_synack_options(sk, req, mss,
2721 skb, &opts, &md5, xvp) 2731 skb, &opts, &md5, xvp, foc)
2722 + sizeof(*th); 2732 + sizeof(*th);
2723 2733
2724 skb_push(skb, tcp_header_size); 2734 skb_push(skb, tcp_header_size);
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2772 } 2782 }
2773 2783
2774 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2784 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2775 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); 2785 /* XXX data is queued and acked as is. No buffer/window check */
2786 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2776 2787
2777 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2788 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2778 th->window = htons(min(req->rcv_wnd, 65535U)); 2789 th->window = htons(min(req->rcv_wnd, 65535U));
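
The ack_seq change is the visible effect of the rcv_nxt accounting: a SYN consumes one sequence number for the flag plus one per payload byte, so the SYN-ACK for a data-bearing SYN must acknowledge past the payload. A tiny model of that arithmetic:

#include <assert.h>
#include <stdint.h>

static uint32_t syn_end_seq(uint32_t seq, uint32_t payload_len)
{
	return seq + 1 + payload_len;	/* modular, wraps like the kernel's */
}

int main(void)
{
	uint32_t isn = 1000;

	assert(syn_end_seq(isn, 0) == isn + 1);		/* plain SYN: old isn + 1 */
	assert(syn_end_seq(isn, 100) == isn + 101);	/* TFO SYN with data */
	return 0;
}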
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b774a03bd1dc..fc04711e80c8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -305,6 +305,35 @@ static void tcp_probe_timer(struct sock *sk)
305} 305}
306 306
307/* 307/*
308 * Timer for a Fast Open socket to retransmit the SYNACK. Note that the
309 * sk here is the child socket, not the parent (listener) socket.
310 */
311static void tcp_fastopen_synack_timer(struct sock *sk)
312{
313 struct inet_connection_sock *icsk = inet_csk(sk);
314 int max_retries = icsk->icsk_syn_retries ? :
315 sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
316 struct request_sock *req;
317
318 req = tcp_sk(sk)->fastopen_rsk;
319 req->rsk_ops->syn_ack_timeout(sk, req);
320
321 if (req->retrans >= max_retries) {
322 tcp_write_err(sk);
323 return;
324 }
325	/* XXX (TFO) - Unlike a regular SYN-ACK retransmit, we ignore the
326	 * error returned from rtx_syn_ack() so this path stays as persistent
327	 * as a regular retransmit: if the child socket has already been
328	 * accepted it's not good to give up too easily.
329 */
330 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
331 req->retrans++;
332 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
333 TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
334}
335
336/*
308 * The TCP retransmit timer. 337 * The TCP retransmit timer.
309 */ 338 */
310 339
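
tcp_fastopen_synack_timer() re-arms itself with TCP_TIMEOUT_INIT << req->retrans, and inet_csk_reset_xmit_timer() clamps the result to TCP_RTO_MAX. A sketch of the resulting schedule, in milliseconds rather than jiffies and assuming the 1 s TCP_TIMEOUT_INIT / 120 s TCP_RTO_MAX of kernels from this era; note req->retrans has already been incremented when the timer is re-armed, so the first re-arm uses a shift of 1:

#include <stdio.h>

#define TCP_TIMEOUT_INIT_MS	1000u
#define TCP_RTO_MAX_MS		120000u

int main(void)
{
	unsigned int retrans;

	for (retrans = 1; retrans <= 8; retrans++) {
		unsigned int rto = TCP_TIMEOUT_INIT_MS << retrans;

		if (rto > TCP_RTO_MAX_MS)	/* clamp, as the kernel does */
			rto = TCP_RTO_MAX_MS;
		printf("retrans %u -> next SYN-ACK in %u ms\n", retrans, rto);
	}
	return 0;
}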
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk)
317 tcp_resume_early_retransmit(sk); 346 tcp_resume_early_retransmit(sk);
318 return; 347 return;
319 } 348 }
320 349 if (tp->fastopen_rsk) {
350 BUG_ON(sk->sk_state != TCP_SYN_RECV &&
351 sk->sk_state != TCP_FIN_WAIT1);
352 tcp_fastopen_synack_timer(sk);
353 /* Before we receive ACK to our SYN-ACK don't retransmit
354 * anything else (e.g., data or FIN segments).
355 */
356 return;
357 }
321 if (!tp->packets_out) 358 if (!tp->packets_out)
322 goto out; 359 goto out;
323 360
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6f6d1aca3c3d..c4e64328d8ba 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2110,7 +2110,9 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
2110 bucket, src, srcp, dest, destp, sp->sk_state, 2110 bucket, src, srcp, dest, destp, sp->sk_state,
2111 sk_wmem_alloc_get(sp), 2111 sk_wmem_alloc_get(sp),
2112 sk_rmem_alloc_get(sp), 2112 sk_rmem_alloc_get(sp),
2113 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), 2113 0, 0L, 0,
2114 from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
2115 0, sock_i_ino(sp),
2114 atomic_read(&sp->sk_refcnt), sp, 2116 atomic_read(&sp->sk_refcnt), sp,
2115 atomic_read(&sp->sk_drops), len); 2117 atomic_read(&sp->sk_drops), len);
2116} 2118}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 16d0960062be..d2f336ea82ca 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -24,7 +24,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
24 if (!inet_diag_bc_sk(bc, sk)) 24 if (!inet_diag_bc_sk(bc, sk))
25 return 0; 25 return 0;
26 26
27 return inet_sk_diag_fill(sk, NULL, skb, req, NETLINK_CB(cb->skb).pid, 27 return inet_sk_diag_fill(sk, NULL, skb, req,
28 sk_user_ns(NETLINK_CB(cb->skb).ssk),
29 NETLINK_CB(cb->skb).pid,
28 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 30 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
29} 31}
30 32
@@ -69,6 +71,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
69 goto out; 71 goto out;
70 72
71 err = inet_sk_diag_fill(sk, NULL, rep, req, 73 err = inet_sk_diag_fill(sk, NULL, rep, req,
74 sk_user_ns(NETLINK_CB(in_skb).ssk),
72 NETLINK_CB(in_skb).pid, 75 NETLINK_CB(in_skb).pid,
73 nlh->nlmsg_seq, 0, nlh); 76 nlh->nlmsg_seq, 0, nlh);
74 if (err < 0) { 77 if (err < 0) {
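
The udp_diag hunks thread the requesting socket's user namespace into inet_sk_diag_fill(), so the idiag_uid reported over sock_diag is munged the same way as the procfs uid columns earlier in this diff. A hypothetical sock_diag client exercising that path, assuming the inet_diag_req_v2 layout from <linux/inet_diag.h> of a v3.3+ kernel:

#include <arpa/inet.h>
#include <linux/inet_diag.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg;
	char buf[8192];
	int len;

	if (fd < 0)
		return 1;
	memset(&msg, 0, sizeof(msg));
	msg.nlh.nlmsg_len = sizeof(msg);
	msg.nlh.nlmsg_type = SOCK_DIAG_BY_FAMILY;
	msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	msg.req.sdiag_family = AF_INET;
	msg.req.sdiag_protocol = IPPROTO_UDP;
	msg.req.idiag_states = 0xffffffff;	/* dump every socket state */

	if (send(fd, &msg, sizeof(msg), 0) < 0)
		return 1;
	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *h = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
			struct inet_diag_msg *r = NLMSG_DATA(h);

			if (h->nlmsg_type == NLMSG_DONE ||
			    h->nlmsg_type == NLMSG_ERROR) {
				close(fd);
				return h->nlmsg_type == NLMSG_ERROR;
			}
			/* idiag_uid arrives mapped into our user namespace */
			printf("udp sport %u uid %u\n",
			       ntohs(r->id.idiag_sport), r->idiag_uid);
		}
	}
	close(fd);
	return 0;
}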