author     Pablo Neira Ayuso <pablo@netfilter.org>    2012-09-03 09:28:30 -0400
committer  Pablo Neira Ayuso <pablo@netfilter.org>    2012-09-03 09:34:51 -0400
commit     ace1fe1231bdfffd60b5e703aa5b7283fbf98dbd
tree       06c7492a8f3cc65f916768616ca24c6bc7171761 /net/ipv4
parent     ce9f3f31efb88841e4df98794b13dbac8c4901da
parent     a2dc375e12334b3d8f787a48b2fb6172ccfb80ae
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
This merges commit 3f509c6 ("netfilter: nf_nat_sip: fix incorrect handling
of EBUSY for RTCP expectation") into Patrick McHardy's IPv6 NAT changes.
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/af_inet.c              |  28
-rw-r--r--  net/ipv4/devinet.c              |   6
-rw-r--r--  net/ipv4/fib_frontend.c         |   7
-rw-r--r--  net/ipv4/inet_connection_sock.c |  57
-rw-r--r--  net/ipv4/inet_diag.c            |  21
-rw-r--r--  net/ipv4/ipmr.c                 |  14
-rw-r--r--  net/ipv4/ping.c                 |  22
-rw-r--r--  net/ipv4/proc.c                 |   4
-rw-r--r--  net/ipv4/raw.c                  |   4
-rw-r--r--  net/ipv4/route.c                |  11
-rw-r--r--  net/ipv4/syncookies.c           |   1
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c      |  87
-rw-r--r--  net/ipv4/tcp.c                  |  49
-rw-r--r--  net/ipv4/tcp_fastopen.c         |  83
-rw-r--r--  net/ipv4/tcp_input.c            |  90
-rw-r--r--  net/ipv4/tcp_ipv4.c             | 275
-rw-r--r--  net/ipv4/tcp_minisocks.c        |  61
-rw-r--r--  net/ipv4/tcp_output.c           |  21
-rw-r--r--  net/ipv4/tcp_timer.c            |  39
-rw-r--r--  net/ipv4/udp.c                  |   4
-rw-r--r--  net/ipv4/udp_diag.c             |   5
21 files changed, 760 insertions, 129 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6681ccf5c3ee..4f70ef0b946d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -149,6 +149,11 @@ void inet_sock_destruct(struct sock *sk)
                 pr_err("Attempt to release alive inet socket %p\n", sk);
                 return;
         }
+        if (sk->sk_type == SOCK_STREAM) {
+                struct fastopen_queue *fastopenq =
+                        inet_csk(sk)->icsk_accept_queue.fastopenq;
+                kfree(fastopenq);
+        }
 
         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
@@ -212,6 +217,26 @@ int inet_listen(struct socket *sock, int backlog)
          * we can only allow the backlog to be adjusted.
          */
         if (old_state != TCP_LISTEN) {
+                /* Check special setups for testing purpose to enable TFO w/o
+                 * requiring TCP_FASTOPEN sockopt.
+                 * Note that only TCP sockets (SOCK_STREAM) will reach here.
+                 * Also fastopenq may already been allocated because this
+                 * socket was in TCP_LISTEN state previously but was
+                 * shutdown() (rather than close()).
+                 */
+                if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+                    inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
+                        if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
+                                err = fastopen_init_queue(sk, backlog);
+                        else if ((sysctl_tcp_fastopen &
+                                  TFO_SERVER_WO_SOCKOPT2) != 0)
+                                err = fastopen_init_queue(sk,
+                                    ((uint)sysctl_tcp_fastopen) >> 16);
+                        else
+                                err = 0;
+                        if (err)
+                                goto out;
+                }
                 err = inet_csk_listen_start(sk, backlog);
                 if (err)
                         goto out;
@@ -701,7 +726,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
 
         sock_rps_record_flow(sk2);
         WARN_ON(!((1 << sk2->sk_state) &
-                  (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+                  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+                  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
 
         sock_graft(sk2, newsock);
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 6a5e6e4b142c..adf273f8ad2e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1147,12 +1147,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
                          void *ptr)
 {
         struct net_device *dev = ptr;
-        struct in_device *in_dev;
-
-        if (event == NETDEV_UNREGISTER_FINAL)
-                goto out;
+        struct in_device *in_dev = __in_dev_get_rtnl(dev);
 
-        in_dev = __in_dev_get_rtnl(dev);
         ASSERT_RTNL();
 
         if (!in_dev) {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index fd7d9ae64f16..acdee325d972 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1050,9 +1050,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
                 return NOTIFY_DONE;
         }
 
-        if (event == NETDEV_UNREGISTER_FINAL)
-                return NOTIFY_DONE;
-
         in_dev = __in_dev_get_rtnl(dev);
 
         switch (event) {
@@ -1064,14 +1061,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
                 fib_sync_up(dev);
 #endif
                 atomic_inc(&net->ipv4.dev_addr_genid);
-                rt_cache_flush(dev_net(dev), -1);
+                rt_cache_flush(net, -1);
                 break;
         case NETDEV_DOWN:
                 fib_disable_ip(dev, 0, 0);
                 break;
         case NETDEV_CHANGEMTU:
         case NETDEV_CHANGE:
-                rt_cache_flush(dev_net(dev), 0);
+                rt_cache_flush(net, 0);
                 break;
         }
         return NOTIFY_DONE;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7f75f21d7b83..8464b79c493f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -283,7 +283,9 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
 {
         struct inet_connection_sock *icsk = inet_csk(sk);
+        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
         struct sock *newsk;
+        struct request_sock *req;
         int error;
 
         lock_sock(sk);
@@ -296,7 +298,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
                 goto out_err;
 
         /* Find already established connection */
-        if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
+        if (reqsk_queue_empty(queue)) {
                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
                 /* If this is a non blocking socket don't sleep */
@@ -308,14 +310,32 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
                 if (error)
                         goto out_err;
         }
-
-        newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
-        WARN_ON(newsk->sk_state == TCP_SYN_RECV);
+        req = reqsk_queue_remove(queue);
+        newsk = req->sk;
+
+        sk_acceptq_removed(sk);
+        if (sk->sk_type == SOCK_STREAM && queue->fastopenq != NULL) {
+                spin_lock_bh(&queue->fastopenq->lock);
+                if (tcp_rsk(req)->listener) {
+                        /* We are still waiting for the final ACK from 3WHS
+                         * so can't free req now. Instead, we set req->sk to
+                         * NULL to signify that the child socket is taken
+                         * so reqsk_fastopen_remove() will free the req
+                         * when 3WHS finishes (or is aborted).
+                         */
+                        req->sk = NULL;
+                        req = NULL;
+                }
+                spin_unlock_bh(&queue->fastopenq->lock);
+        }
 out:
         release_sock(sk);
+        if (req)
+                __reqsk_free(req);
         return newsk;
 out_err:
         newsk = NULL;
+        req = NULL;
         *err = error;
         goto out;
 }
@@ -720,13 +740,14 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
 void inet_csk_listen_stop(struct sock *sk)
 {
         struct inet_connection_sock *icsk = inet_csk(sk);
+        struct request_sock_queue *queue = &icsk->icsk_accept_queue;
         struct request_sock *acc_req;
         struct request_sock *req;
 
         inet_csk_delete_keepalive_timer(sk);
 
         /* make all the listen_opt local to us */
-        acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
+        acc_req = reqsk_queue_yank_acceptq(queue);
 
         /* Following specs, it would be better either to send FIN
          * (and enter FIN-WAIT-1, it is normal close)
@@ -736,7 +757,7 @@ void inet_csk_listen_stop(struct sock *sk)
          * To be honest, we are not able to make either
          * of the variants now. --ANK
          */
-        reqsk_queue_destroy(&icsk->icsk_accept_queue);
+        reqsk_queue_destroy(queue);
 
         while ((req = acc_req) != NULL) {
                 struct sock *child = req->sk;
@@ -754,6 +775,19 @@ void inet_csk_listen_stop(struct sock *sk)
 
                 percpu_counter_inc(sk->sk_prot->orphan_count);
 
+                if (sk->sk_type == SOCK_STREAM && tcp_rsk(req)->listener) {
+                        BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+                        BUG_ON(sk != tcp_rsk(req)->listener);
+
+                        /* Paranoid, to prevent race condition if
+                         * an inbound pkt destined for child is
+                         * blocked by sock lock in tcp_v4_rcv().
+                         * Also to satisfy an assertion in
+                         * tcp_v4_destroy_sock().
+                         */
+                        tcp_sk(child)->fastopen_rsk = NULL;
+                        sock_put(sk);
+                }
                 inet_csk_destroy_sock(child);
 
                 bh_unlock_sock(child);
@@ -763,6 +797,17 @@ void inet_csk_listen_stop(struct sock *sk)
                 sk_acceptq_removed(sk);
                 __reqsk_free(req);
         }
+        if (queue->fastopenq != NULL) {
+                /* Free all the reqs queued in rskq_rst_head. */
+                spin_lock_bh(&queue->fastopenq->lock);
+                acc_req = queue->fastopenq->rskq_rst_head;
+                queue->fastopenq->rskq_rst_head = NULL;
+                spin_unlock_bh(&queue->fastopenq->lock);
+                while ((req = acc_req) != NULL) {
+                        acc_req = req->dl_next;
+                        __reqsk_free(req);
+                }
+        }
         WARN_ON(sk->sk_ack_backlog);
 }
 EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 570e61f9611f..8bc005b1435f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -69,6 +69,7 @@ static inline void inet_diag_unlock_handler(
 
 int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
                       struct sk_buff *skb, struct inet_diag_req_v2 *req,
+                      struct user_namespace *user_ns,
                       u32 pid, u32 seq, u16 nlmsg_flags,
                       const struct nlmsghdr *unlh)
 {
@@ -124,7 +125,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
         }
 #endif
 
-        r->idiag_uid = sock_i_uid(sk);
+        r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
         r->idiag_inode = sock_i_ino(sk);
 
         if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -199,11 +200,12 @@ EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
 
 static int inet_csk_diag_fill(struct sock *sk,
                               struct sk_buff *skb, struct inet_diag_req_v2 *req,
+                              struct user_namespace *user_ns,
                               u32 pid, u32 seq, u16 nlmsg_flags,
                               const struct nlmsghdr *unlh)
 {
         return inet_sk_diag_fill(sk, inet_csk(sk),
-                        skb, req, pid, seq, nlmsg_flags, unlh);
+                        skb, req, user_ns, pid, seq, nlmsg_flags, unlh);
 }
 
 static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
@@ -256,14 +258,16 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
-                        struct inet_diag_req_v2 *r, u32 pid, u32 seq, u16 nlmsg_flags,
+                        struct inet_diag_req_v2 *r,
+                        struct user_namespace *user_ns,
+                        u32 pid, u32 seq, u16 nlmsg_flags,
                         const struct nlmsghdr *unlh)
 {
         if (sk->sk_state == TCP_TIME_WAIT)
                 return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
                                            skb, r, pid, seq, nlmsg_flags,
                                            unlh);
-        return inet_csk_diag_fill(sk, skb, r, pid, seq, nlmsg_flags, unlh);
+        return inet_csk_diag_fill(sk, skb, r, user_ns, pid, seq, nlmsg_flags, unlh);
 }
 
 int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
@@ -311,6 +315,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
         }
 
         err = sk_diag_fill(sk, rep, req,
+                           sk_user_ns(NETLINK_CB(in_skb).ssk),
                            NETLINK_CB(in_skb).pid,
                            nlh->nlmsg_seq, 0, nlh);
         if (err < 0) {
@@ -551,6 +556,7 @@ static int inet_csk_diag_dump(struct sock *sk,
                 return 0;
 
         return inet_csk_diag_fill(sk, skb, r,
+                                  sk_user_ns(NETLINK_CB(cb->skb).ssk),
                                   NETLINK_CB(cb->skb).pid,
                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
 }
@@ -591,7 +597,9 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
 }
 
 static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
-                              struct request_sock *req, u32 pid, u32 seq,
+                              struct request_sock *req,
+                              struct user_namespace *user_ns,
+                              u32 pid, u32 seq,
                               const struct nlmsghdr *unlh)
 {
         const struct inet_request_sock *ireq = inet_rsk(req);
@@ -625,7 +633,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
         r->idiag_expires = jiffies_to_msecs(tmo);
         r->idiag_rqueue = 0;
         r->idiag_wqueue = 0;
-        r->idiag_uid = sock_i_uid(sk);
+        r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
         r->idiag_inode = 0;
 #if IS_ENABLED(CONFIG_IPV6)
         if (r->idiag_family == AF_INET6) {
@@ -702,6 +710,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
                         }
 
                         err = inet_diag_fill_req(skb, sk, req,
+                                                 sk_user_ns(NETLINK_CB(cb->skb).ssk),
                                                  NETLINK_CB(cb->skb).pid,
                                                  cb->nlh->nlmsg_seq, cb->nlh);
                         if (err < 0) {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3a57570c8ee5..8aa7a4cf9139 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -124,6 +124,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
 static struct kmem_cache *mrt_cachep __read_mostly;
 
 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
+static void ipmr_free_table(struct mr_table *mrt);
+
 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
                          struct sk_buff *skb, struct mfc_cache *cache,
                          int local);
@@ -131,6 +133,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
                              struct sk_buff *pkt, vifi_t vifi, int assert);
 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
                               struct mfc_cache *c, struct rtmsg *rtm);
+static void mroute_clean_tables(struct mr_table *mrt);
 static void ipmr_expire_process(unsigned long arg);
 
 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -271,7 +274,7 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 
         list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
                 list_del(&mrt->list);
-                kfree(mrt);
+                ipmr_free_table(mrt);
         }
         fib_rules_unregister(net->ipv4.mr_rules_ops);
 }
@@ -299,7 +302,7 @@ static int __net_init ipmr_rules_init(struct net *net)
 
 static void __net_exit ipmr_rules_exit(struct net *net)
 {
-        kfree(net->ipv4.mrt);
+        ipmr_free_table(net->ipv4.mrt);
 }
 #endif
 
@@ -336,6 +339,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
         return mrt;
 }
 
+static void ipmr_free_table(struct mr_table *mrt)
+{
+        del_timer_sync(&mrt->ipmr_expire_timer);
+        mroute_clean_tables(mrt);
+        kfree(mrt);
+}
+
 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
 
 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 6232d476f37e..8f3d05424a3e 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -185,10 +185,10 @@ exit:
         return sk;
 }
 
-static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
-                                          gid_t *high)
+static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
+                                          kgid_t *high)
 {
-        gid_t *data = net->ipv4.sysctl_ping_group_range;
+        kgid_t *data = net->ipv4.sysctl_ping_group_range;
         unsigned int seq;
 
         do {
@@ -203,19 +203,13 @@ static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
 static int ping_init_sock(struct sock *sk)
 {
         struct net *net = sock_net(sk);
-        gid_t group = current_egid();
-        gid_t range[2];
+        kgid_t group = current_egid();
         struct group_info *group_info = get_current_groups();
         int i, j, count = group_info->ngroups;
         kgid_t low, high;
 
-        inet_get_ping_group_range_net(net, range, range+1);
-        low = make_kgid(&init_user_ns, range[0]);
-        high = make_kgid(&init_user_ns, range[1]);
-        if (!gid_valid(low) || !gid_valid(high) || gid_lt(high, low))
-                return -EACCES;
-
-        if (range[0] <= group && group <= range[1])
+        inet_get_ping_group_range_net(net, &low, &high);
+        if (gid_lte(low, group) && gid_lte(group, high))
                 return 0;
 
         for (i = 0; i < group_info->nblocks; i++) {
@@ -845,7 +839,9 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f,
                 bucket, src, srcp, dest, destp, sp->sk_state,
                 sk_wmem_alloc_get(sp),
                 sk_rmem_alloc_get(sp),
-                0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+                0, 0L, 0,
+                from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+                0, sock_i_ino(sp),
                 atomic_read(&sp->sk_refcnt), sp,
                 atomic_read(&sp->sk_drops), len);
 }
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 957acd12250b..8de53e1ddd54 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -263,6 +263,10 @@ static const struct snmp_mib snmp4_net_list[] = {
         SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
         SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
         SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
+        SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
+        SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
+        SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
+        SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
         SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ff0f071969ea..f2425785d40a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -992,7 +992,9 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
                 i, src, srcp, dest, destp, sp->sk_state,
                 sk_wmem_alloc_get(sp),
                 sk_rmem_alloc_get(sp),
-                0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+                0, 0L, 0,
+                from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+                0, sock_i_ino(sp),
                 atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 50f6d3adb474..dc9549b5eb1c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -934,12 +934,14 @@ static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
         if (mtu < ip_rt_min_pmtu)
                 mtu = ip_rt_min_pmtu;
 
+        rcu_read_lock();
         if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
                 struct fib_nh *nh = &FIB_RES_NH(res);
 
                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                       jiffies + ip_rt_mtu_expires);
         }
+        rcu_read_unlock();
         return mtu;
 }
 
@@ -956,7 +958,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                 dst->obsolete = DST_OBSOLETE_KILL;
         } else {
                 rt->rt_pmtu = mtu;
-                dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+                rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
         }
 }
 
@@ -1132,10 +1134,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
         const struct rtable *rt = (const struct rtable *) dst;
         unsigned int mtu = rt->rt_pmtu;
 
-        if (mtu && time_after_eq(jiffies, rt->dst.expires))
-                mtu = 0;
-
-        if (!mtu)
+        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                 mtu = dst_metric_raw(dst, RTAX_MTU);
 
         if (mtu && rt_is_output_route(rt))
@@ -1263,7 +1262,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 {
         struct rtable *rt = (struct rtable *) dst;
 
-        if (dst->flags & DST_NOCACHE) {
+        if (!list_empty(&rt->rt_uncached)) {
                 spin_lock_bh(&rt_uncached_lock);
                 list_del(&rt->rt_uncached);
                 spin_unlock_bh(&rt_uncached_lock);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 650e1528e1e6..ba48e799b031 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -319,6 +319,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
         ireq->tstamp_ok = tcp_opt.saw_tstamp;
         req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
         treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
+        treq->listener = NULL;
 
         /* We throwed the options of the initial SYN away, so we hope
          * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1b5ce96707a3..9205e492dc9d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -76,9 +76,9 @@ static int ipv4_local_port_range(ctl_table *table, int write,
 }
 
 
-static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
+static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
 {
-        gid_t *data = table->data;
+        kgid_t *data = table->data;
         unsigned int seq;
         do {
                 seq = read_seqbegin(&sysctl_local_ports.lock);
@@ -89,12 +89,12 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low,
 }
 
 /* Update system visible IP port range */
-static void set_ping_group_range(struct ctl_table *table, gid_t range[2])
+static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
 {
-        gid_t *data = table->data;
+        kgid_t *data = table->data;
         write_seqlock(&sysctl_local_ports.lock);
-        data[0] = range[0];
-        data[1] = range[1];
+        data[0] = low;
+        data[1] = high;
         write_sequnlock(&sysctl_local_ports.lock);
 }
 
@@ -103,21 +103,33 @@ static int ipv4_ping_group_range(ctl_table *table, int write,
                                  void __user *buffer,
                                  size_t *lenp, loff_t *ppos)
 {
+        struct user_namespace *user_ns = current_user_ns();
         int ret;
-        gid_t range[2];
+        gid_t urange[2];
+        kgid_t low, high;
         ctl_table tmp = {
-                .data = &range,
-                .maxlen = sizeof(range),
+                .data = &urange,
+                .maxlen = sizeof(urange),
                 .mode = table->mode,
                 .extra1 = &ip_ping_group_range_min,
                 .extra2 = &ip_ping_group_range_max,
         };
 
-        inet_get_ping_group_range_table(table, range, range + 1);
+        inet_get_ping_group_range_table(table, &low, &high);
+        urange[0] = from_kgid_munged(user_ns, low);
+        urange[1] = from_kgid_munged(user_ns, high);
         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 
-        if (write && ret == 0)
-                set_ping_group_range(table, range);
+        if (write && ret == 0) {
+                low = make_kgid(user_ns, urange[0]);
+                high = make_kgid(user_ns, urange[1]);
+                if (!gid_valid(low) || !gid_valid(high) ||
+                    (urange[1] < urange[0]) || gid_lt(high, low)) {
+                        low = make_kgid(&init_user_ns, 1);
+                        high = make_kgid(&init_user_ns, 0);
+                }
+                set_ping_group_range(table, low, high);
+        }
 
         return ret;
 }
@@ -220,6 +232,45 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
         return 0;
 }
 
+int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
+                          size_t *lenp, loff_t *ppos)
+{
+        ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
+        struct tcp_fastopen_context *ctxt;
+        int ret;
+        u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
+
+        tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
+        if (!tbl.data)
+                return -ENOMEM;
+
+        rcu_read_lock();
+        ctxt = rcu_dereference(tcp_fastopen_ctx);
+        if (ctxt)
+                memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+        rcu_read_unlock();
+
+        snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
+                user_key[0], user_key[1], user_key[2], user_key[3]);
+        ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+
+        if (write && ret == 0) {
+                if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
+                           user_key + 2, user_key + 3) != 4) {
+                        ret = -EINVAL;
+                        goto bad_key;
+                }
+                tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
+        }
+
+bad_key:
+        pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
+               user_key[0], user_key[1], user_key[2], user_key[3],
+               (char *)tbl.data, ret);
+        kfree(tbl.data);
+        return ret;
+}
+
 static struct ctl_table ipv4_table[] = {
         {
                 .procname = "tcp_timestamps",
@@ -374,6 +425,12 @@ static struct ctl_table ipv4_table[] = {
                 .proc_handler = proc_dointvec,
         },
         {
+                .procname = "tcp_fastopen_key",
+                .mode = 0600,
+                .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+                .proc_handler = proc_tcp_fastopen_key,
+        },
+        {
                 .procname = "tcp_tw_recycle",
                 .data = &tcp_death_row.sysctl_tw_recycle,
                 .maxlen = sizeof(int),
@@ -786,7 +843,7 @@ static struct ctl_table ipv4_net_table[] = {
         {
                 .procname = "ping_group_range",
                 .data = &init_net.ipv4.sysctl_ping_group_range,
-                .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
+                .maxlen = sizeof(gid_t)*2,
                 .mode = 0644,
                 .proc_handler = ipv4_ping_group_range,
         },
@@ -830,8 +887,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
          * Sane defaults - nobody may create ping sockets.
          * Boot scripts should set this to distro-specific group.
          */
-        net->ipv4.sysctl_ping_group_range[0] = 1;
-        net->ipv4.sysctl_ping_group_range[1] = 0;
+        net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1);
+        net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0);
 
         tcp_init_mem(net);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2109ff4a1daf..df83d744e380 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -486,8 +486,9 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
         if (sk->sk_shutdown & RCV_SHUTDOWN)
                 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 
-        /* Connected? */
-        if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+        /* Connected or passive Fast Open socket? */
+        if (sk->sk_state != TCP_SYN_SENT &&
+            (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
                 int target = sock_rcvlowat(sk, 0, INT_MAX);
 
                 if (tp->urg_seq == tp->copied_seq &&
@@ -840,10 +841,15 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
         ssize_t copied;
         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-        /* Wait for a connection to finish. */
-        if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+        /* Wait for a connection to finish. One exception is TCP Fast Open
+         * (passive side) where data is allowed to be sent before a connection
+         * is fully established.
+         */
+        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+            !tcp_passive_fastopen(sk)) {
                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
                         goto out_err;
+        }
 
         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
@@ -1042,10 +1048,15 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
-        /* Wait for a connection to finish. */
-        if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+        /* Wait for a connection to finish. One exception is TCP Fast Open
+         * (passive side) where data is allowed to be sent before a connection
+         * is fully established.
+         */
+        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
+            !tcp_passive_fastopen(sk)) {
                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
                         goto do_error;
+        }
 
         if (unlikely(tp->repair)) {
                 if (tp->repair_queue == TCP_RECV_QUEUE) {
@@ -2144,6 +2155,10 @@ void tcp_close(struct sock *sk, long timeout)
                  * they look as CLOSING or LAST_ACK for Linux)
                  * Probably, I missed some more holelets.
                  * --ANK
+                 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
+                 * in a single packet! (May consider it later but will
+                 * probably need API support or TCP_CORK SYN-ACK until
+                 * data is written and socket is closed.)
                  */
                 tcp_send_fin(sk);
         }
@@ -2215,8 +2230,16 @@ adjudge_to_death:
                 }
         }
 
-        if (sk->sk_state == TCP_CLOSE)
+        if (sk->sk_state == TCP_CLOSE) {
+                struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+                /* We could get here with a non-NULL req if the socket is
+                 * aborted (e.g., closed with unread data) before 3WHS
+                 * finishes.
+                 */
+                if (req != NULL)
+                        reqsk_fastopen_remove(sk, req, false);
                 inet_csk_destroy_sock(sk);
+        }
         /* Otherwise, socket is reprieved until protocol close. */
 
 out:
@@ -2688,6 +2711,14 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                 else
                         icsk->icsk_user_timeout = msecs_to_jiffies(val);
                 break;
+
+        case TCP_FASTOPEN:
+                if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
+                    TCPF_LISTEN)))
+                        err = fastopen_init_queue(sk, val);
+                else
+                        err = -EINVAL;
+                break;
         default:
                 err = -ENOPROTOOPT;
                 break;
@@ -3501,11 +3532,15 @@ EXPORT_SYMBOL(tcp_cookie_generator);
 
 void tcp_done(struct sock *sk)
 {
+        struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+
         if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
                 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
         tcp_set_state(sk, TCP_CLOSE);
         tcp_clear_xmit_timers(sk);
+        if (req != NULL)
+                reqsk_fastopen_remove(sk, req, false);
 
         sk->sk_shutdown = SHUTDOWN_MASK;
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a7f729c409d7..8f7ef0ad80e5 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,10 +1,91 @@
+#include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/tcp.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <net/inetpeer.h>
+#include <net/tcp.h>
 
-int sysctl_tcp_fastopen;
+int sysctl_tcp_fastopen __read_mostly;
+
+struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
+
+static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
+
+static void tcp_fastopen_ctx_free(struct rcu_head *head)
+{
+        struct tcp_fastopen_context *ctx =
+                container_of(head, struct tcp_fastopen_context, rcu);
+        crypto_free_cipher(ctx->tfm);
+        kfree(ctx);
+}
+
+int tcp_fastopen_reset_cipher(void *key, unsigned int len)
+{
+        int err;
+        struct tcp_fastopen_context *ctx, *octx;
+
+        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+        if (!ctx)
+                return -ENOMEM;
+        ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
+
+        if (IS_ERR(ctx->tfm)) {
+                err = PTR_ERR(ctx->tfm);
+error:          kfree(ctx);
+                pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
+                return err;
+        }
+        err = crypto_cipher_setkey(ctx->tfm, key, len);
+        if (err) {
+                pr_err("TCP: TFO cipher key error: %d\n", err);
+                crypto_free_cipher(ctx->tfm);
+                goto error;
+        }
+        memcpy(ctx->key, key, len);
+
+        spin_lock(&tcp_fastopen_ctx_lock);
+
+        octx = rcu_dereference_protected(tcp_fastopen_ctx,
+                        lockdep_is_held(&tcp_fastopen_ctx_lock));
+        rcu_assign_pointer(tcp_fastopen_ctx, ctx);
+        spin_unlock(&tcp_fastopen_ctx_lock);
+
+        if (octx)
+                call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+        return err;
+}
+
+/* Computes the fastopen cookie for the peer.
+ * The peer address is a 128 bits long (pad with zeros for IPv4).
+ *
+ * The caller must check foc->len to determine if a valid cookie
+ * has been generated successfully.
+ */
+void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
+{
+        __be32 peer_addr[4] = { addr, 0, 0, 0 };
+        struct tcp_fastopen_context *ctx;
+
+        rcu_read_lock();
+        ctx = rcu_dereference(tcp_fastopen_ctx);
+        if (ctx) {
+                crypto_cipher_encrypt_one(ctx->tfm,
+                                          foc->val,
+                                          (__u8 *)peer_addr);
+                foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+        }
+        rcu_read_unlock();
+}
 
 static int __init tcp_fastopen_init(void)
 {
+        __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+
+        get_random_bytes(key, sizeof(key));
+        tcp_fastopen_reset_cipher(key, sizeof(key));
         return 0;
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bcfccc5cb8d0..8c304a400798 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -378,7 +378,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) | |||
378 | /* 4. Try to fixup all. It is made immediately after connection enters | 378 | /* 4. Try to fixup all. It is made immediately after connection enters |
379 | * established state. | 379 | * established state. |
380 | */ | 380 | */ |
381 | static void tcp_init_buffer_space(struct sock *sk) | 381 | void tcp_init_buffer_space(struct sock *sk) |
382 | { | 382 | { |
383 | struct tcp_sock *tp = tcp_sk(sk); | 383 | struct tcp_sock *tp = tcp_sk(sk); |
384 | int maxwin; | 384 | int maxwin; |
@@ -2930,13 +2930,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) | |||
2930 | * tcp_xmit_retransmit_queue(). | 2930 | * tcp_xmit_retransmit_queue(). |
2931 | */ | 2931 | */ |
2932 | static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | 2932 | static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, |
2933 | int newly_acked_sacked, bool is_dupack, | 2933 | int prior_sacked, bool is_dupack, |
2934 | int flag) | 2934 | int flag) |
2935 | { | 2935 | { |
2936 | struct inet_connection_sock *icsk = inet_csk(sk); | 2936 | struct inet_connection_sock *icsk = inet_csk(sk); |
2937 | struct tcp_sock *tp = tcp_sk(sk); | 2937 | struct tcp_sock *tp = tcp_sk(sk); |
2938 | int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && | 2938 | int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && |
2939 | (tcp_fackets_out(tp) > tp->reordering)); | 2939 | (tcp_fackets_out(tp) > tp->reordering)); |
2940 | int newly_acked_sacked = 0; | ||
2940 | int fast_rexmit = 0; | 2941 | int fast_rexmit = 0; |
2941 | 2942 | ||
2942 | if (WARN_ON(!tp->packets_out && tp->sacked_out)) | 2943 | if (WARN_ON(!tp->packets_out && tp->sacked_out)) |
@@ -2996,6 +2997,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
2996 | tcp_add_reno_sack(sk); | 2997 | tcp_add_reno_sack(sk); |
2997 | } else | 2998 | } else |
2998 | do_lost = tcp_try_undo_partial(sk, pkts_acked); | 2999 | do_lost = tcp_try_undo_partial(sk, pkts_acked); |
3000 | newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; | ||
2999 | break; | 3001 | break; |
3000 | case TCP_CA_Loss: | 3002 | case TCP_CA_Loss: |
3001 | if (flag & FLAG_DATA_ACKED) | 3003 | if (flag & FLAG_DATA_ACKED) |
@@ -3017,6 +3019,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, | |||
3017 | if (is_dupack) | 3019 | if (is_dupack) |
3018 | tcp_add_reno_sack(sk); | 3020 | tcp_add_reno_sack(sk); |
3019 | } | 3021 | } |
3022 | newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; | ||
3020 | 3023 | ||
3021 | if (icsk->icsk_ca_state <= TCP_CA_Disorder) | 3024 | if (icsk->icsk_ca_state <= TCP_CA_Disorder) |
3022 | tcp_try_undo_dsack(sk); | 3025 | tcp_try_undo_dsack(sk); |
@@ -3124,6 +3127,12 @@ void tcp_rearm_rto(struct sock *sk) | |||
3124 | { | 3127 | { |
3125 | struct tcp_sock *tp = tcp_sk(sk); | 3128 | struct tcp_sock *tp = tcp_sk(sk); |
3126 | 3129 | ||
3130 | /* If the retrans timer is currently being used by Fast Open | ||
3131 | * for SYN-ACK retrans purpose, stay put. | ||
3132 | */ | ||
3133 | if (tp->fastopen_rsk) | ||
3134 | return; | ||
3135 | |||
3127 | if (!tp->packets_out) { | 3136 | if (!tp->packets_out) { |
3128 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); | 3137 | inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); |
3129 | } else { | 3138 | } else { |
@@ -3594,7 +3603,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3594 | int prior_packets; | 3603 | int prior_packets; |
3595 | int prior_sacked = tp->sacked_out; | 3604 | int prior_sacked = tp->sacked_out; |
3596 | int pkts_acked = 0; | 3605 | int pkts_acked = 0; |
3597 | int newly_acked_sacked = 0; | ||
3598 | bool frto_cwnd = false; | 3606 | bool frto_cwnd = false; |
3599 | 3607 | ||
3600 | /* If the ack is older than previous acks | 3608 | /* If the ack is older than previous acks |
@@ -3670,8 +3678,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3670 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); | 3678 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); |
3671 | 3679 | ||
3672 | pkts_acked = prior_packets - tp->packets_out; | 3680 | pkts_acked = prior_packets - tp->packets_out; |
3673 | newly_acked_sacked = (prior_packets - prior_sacked) - | ||
3674 | (tp->packets_out - tp->sacked_out); | ||
3675 | 3681 | ||
3676 | if (tp->frto_counter) | 3682 | if (tp->frto_counter) |
3677 | frto_cwnd = tcp_process_frto(sk, flag); | 3683 | frto_cwnd = tcp_process_frto(sk, flag); |
@@ -3685,7 +3691,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3685 | tcp_may_raise_cwnd(sk, flag)) | 3691 | tcp_may_raise_cwnd(sk, flag)) |
3686 | tcp_cong_avoid(sk, ack, prior_in_flight); | 3692 | tcp_cong_avoid(sk, ack, prior_in_flight); |
3687 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); | 3693 | is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); |
3688 | tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, | 3694 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, |
3689 | is_dupack, flag); | 3695 | is_dupack, flag); |
3690 | } else { | 3696 | } else { |
3691 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) | 3697 | if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) |
@@ -3702,7 +3708,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |||
3702 | no_queue: | 3708 | no_queue: |
3703 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ | 3709 | /* If data was DSACKed, see if we can undo a cwnd reduction. */ |
3704 | if (flag & FLAG_DSACKING_ACK) | 3710 | if (flag & FLAG_DSACKING_ACK) |
3705 | tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, | 3711 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, |
3706 | is_dupack, flag); | 3712 | is_dupack, flag); |
3707 | /* If this ack opens up a zero window, clear backoff. It was | 3713 | /* If this ack opens up a zero window, clear backoff. It was |
3708 | * being used to time the probes, and is probably far higher than | 3714 | * being used to time the probes, and is probably far higher than |
@@ -3722,8 +3728,7 @@ old_ack: | |||
3722 | */ | 3728 | */ |
3723 | if (TCP_SKB_CB(skb)->sacked) { | 3729 | if (TCP_SKB_CB(skb)->sacked) { |
3724 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); | 3730 | flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); |
3725 | newly_acked_sacked = tp->sacked_out - prior_sacked; | 3731 | tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, |
3726 | tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, | ||
3727 | is_dupack, flag); | 3732 | is_dupack, flag); |
3728 | } | 3733 | } |
3729 | 3734 | ||
@@ -4039,7 +4044,7 @@ static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) | |||
4039 | } | 4044 | } |
4040 | 4045 | ||
4041 | /* When we get a reset we do this. */ | 4046 | /* When we get a reset we do this. */ |
4042 | static void tcp_reset(struct sock *sk) | 4047 | void tcp_reset(struct sock *sk) |
4043 | { | 4048 | { |
4044 | /* We want the right error as BSD sees it (and indeed as we do). */ | 4049 | /* We want the right error as BSD sees it (and indeed as we do). */ |
4045 | switch (sk->sk_state) { | 4050 | switch (sk->sk_state) { |
@@ -5896,7 +5901,9 @@ discard: | |||
5896 | tcp_send_synack(sk); | 5901 | tcp_send_synack(sk); |
5897 | #if 0 | 5902 | #if 0 |
5898 | /* Note, we could accept data and URG from this segment. | 5903 | /* Note, we could accept data and URG from this segment. |
5899 | * There are no obstacles to make this. | 5904 | * There are no obstacles to make this (except that we must |
5905 | * either change tcp_recvmsg() to prevent it from returning data | ||
5906 | * before 3WHS completes per RFC793, or employ TCP Fast Open). | ||
5900 | * | 5907 | * |
5901 | * However, if we ignore data in ACKless segments sometimes, | 5908 | * However, if we ignore data in ACKless segments sometimes, |
5902 | * we have no reasons to accept it sometimes. | 5909 | * we have no reasons to accept it sometimes. |
@@ -5936,6 +5943,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
5936 | { | 5943 | { |
5937 | struct tcp_sock *tp = tcp_sk(sk); | 5944 | struct tcp_sock *tp = tcp_sk(sk); |
5938 | struct inet_connection_sock *icsk = inet_csk(sk); | 5945 | struct inet_connection_sock *icsk = inet_csk(sk); |
5946 | struct request_sock *req; | ||
5939 | int queued = 0; | 5947 | int queued = 0; |
5940 | 5948 | ||
5941 | tp->rx_opt.saw_tstamp = 0; | 5949 | tp->rx_opt.saw_tstamp = 0; |
@@ -5991,7 +5999,14 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
5991 | return 0; | 5999 | return 0; |
5992 | } | 6000 | } |
5993 | 6001 | ||
5994 | if (!tcp_validate_incoming(sk, skb, th, 0)) | 6002 | req = tp->fastopen_rsk; |
6003 | if (req != NULL) { | ||
6004 | BUG_ON(sk->sk_state != TCP_SYN_RECV && | ||
6005 | sk->sk_state != TCP_FIN_WAIT1); | ||
6006 | |||
6007 | if (tcp_check_req(sk, skb, req, NULL, true) == NULL) | ||
6008 | goto discard; | ||
6009 | } else if (!tcp_validate_incoming(sk, skb, th, 0)) | ||
5995 | return 0; | 6010 | return 0; |
5996 | 6011 | ||
5997 | /* step 5: check the ACK field */ | 6012 | /* step 5: check the ACK field */ |
@@ -6001,7 +6016,22 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
6001 | switch (sk->sk_state) { | 6016 | switch (sk->sk_state) { |
6002 | case TCP_SYN_RECV: | 6017 | case TCP_SYN_RECV: |
6003 | if (acceptable) { | 6018 | if (acceptable) { |
6004 | tp->copied_seq = tp->rcv_nxt; | 6019 | /* Once we leave TCP_SYN_RECV, we no longer |
6020 | * need req so release it. | ||
6021 | */ | ||
6022 | if (req) { | ||
6023 | reqsk_fastopen_remove(sk, req, false); | ||
6024 | } else { | ||
6025 | /* Make sure socket is routed, for | ||
6026 | * correct metrics. | ||
6027 | */ | ||
6028 | icsk->icsk_af_ops->rebuild_header(sk); | ||
6029 | tcp_init_congestion_control(sk); | ||
6030 | |||
6031 | tcp_mtup_init(sk); | ||
6032 | tcp_init_buffer_space(sk); | ||
6033 | tp->copied_seq = tp->rcv_nxt; | ||
6034 | } | ||
6005 | smp_mb(); | 6035 | smp_mb(); |
6006 | tcp_set_state(sk, TCP_ESTABLISHED); | 6036 | tcp_set_state(sk, TCP_ESTABLISHED); |
6007 | sk->sk_state_change(sk); | 6037 | sk->sk_state_change(sk); |
@@ -6023,23 +6053,27 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
6023 | if (tp->rx_opt.tstamp_ok) | 6053 | if (tp->rx_opt.tstamp_ok) |
6024 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 6054 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
6025 | 6055 | ||
6026 | /* Make sure socket is routed, for | 6056 | if (req) { |
6027 | * correct metrics. | 6057 | /* Re-arm the timer because data may |
6028 | */ | 6058 | * have been sent out. This is similar |
6029 | icsk->icsk_af_ops->rebuild_header(sk); | 6059 | * to the regular data transmission case |
6030 | 6060 | * when new data has just been ack'ed. | |
6031 | tcp_init_metrics(sk); | 6061 | * |
6032 | 6062 | * (TFO) - we could try to be more | |
6033 | tcp_init_congestion_control(sk); | 6063 | * aggressive and retransmitting any data |
6064 | * sooner based on when they were sent | ||
6065 | * out. | ||
6066 | */ | ||
6067 | tcp_rearm_rto(sk); | ||
6068 | } else | ||
6069 | tcp_init_metrics(sk); | ||
6034 | 6070 | ||
6035 | /* Prevent spurious tcp_cwnd_restart() on | 6071 | /* Prevent spurious tcp_cwnd_restart() on |
6036 | * first data packet. | 6072 | * first data packet. |
6037 | */ | 6073 | */ |
6038 | tp->lsndtime = tcp_time_stamp; | 6074 | tp->lsndtime = tcp_time_stamp; |
6039 | 6075 | ||
6040 | tcp_mtup_init(sk); | ||
6041 | tcp_initialize_rcv_mss(sk); | 6076 | tcp_initialize_rcv_mss(sk); |
6042 | tcp_init_buffer_space(sk); | ||
6043 | tcp_fast_path_on(tp); | 6077 | tcp_fast_path_on(tp); |
6044 | } else { | 6078 | } else { |
6045 | return 1; | 6079 | return 1; |
@@ -6047,6 +6081,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
6047 | break; | 6081 | break; |
6048 | 6082 | ||
6049 | case TCP_FIN_WAIT1: | 6083 | case TCP_FIN_WAIT1: |
6084 | /* If we enter the TCP_FIN_WAIT1 state and we are a | ||
6085 | * Fast Open socket and this is the first acceptable | ||
6086 | * ACK we have received, this would have acknowledged | ||
6087 | * our SYNACK so stop the SYNACK timer. | ||
6088 | */ | ||
6089 | if (acceptable && req != NULL) { | ||
6090 | /* We no longer need the request sock. */ | ||
6091 | reqsk_fastopen_remove(sk, req, false); | ||
6092 | tcp_rearm_rto(sk); | ||
6093 | } | ||
6050 | if (tp->snd_una == tp->write_seq) { | 6094 | if (tp->snd_una == tp->write_seq) { |
6051 | struct dst_entry *dst; | 6095 | struct dst_entry *dst; |
6052 | 6096 | ||
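
The tcp_input.c hunks above are what let a server-side socket sit in TCP_SYN_RECV with client data already queued: the child keeps a pointer to its request_sock in tp->fastopen_rsk, validates the handshake-completing ACK against it via tcp_check_req(), and re-arms the retransmit timer instead of re-running the usual connection-initialization steps. For context, the stand-alone sketch below shows the kind of client that produces such a connection, using the sendto(MSG_FASTOPEN) interface from the companion client-side Fast Open series; the flag value is defined locally in case libc headers predate it, and error handling is trimmed. It is an illustration, not part of this patch.

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #ifndef MSG_FASTOPEN
    #define MSG_FASTOPEN 0x20000000   /* flag from the client-side Fast Open series */
    #endif

    int main(void)
    {
        const char req[] = "GET / HTTP/1.0\r\n\r\n";
        struct sockaddr_in srv = {
            .sin_family = AF_INET,
            .sin_port   = htons(80),
        };
        int fd;

        inet_pton(AF_INET, "192.0.2.1", &srv.sin_addr);

        fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0)
            return 1;

        /* No connect(): the request rides on the SYN (together with a cookie
         * if one is cached), which is the segment tcp_v4_conn_request() and
         * the TCP_SYN_RECV handling above must cope with on the server. */
        if (sendto(fd, req, sizeof(req) - 1, MSG_FASTOPEN,
                   (struct sockaddr *)&srv, sizeof(srv)) < 0)
            perror("sendto(MSG_FASTOPEN)");

        close(fd);
        return 0;
    }
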
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1e15c5be04e7..e64abed249cc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -352,6 +352,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
352 | const int code = icmp_hdr(icmp_skb)->code; | 352 | const int code = icmp_hdr(icmp_skb)->code; |
353 | struct sock *sk; | 353 | struct sock *sk; |
354 | struct sk_buff *skb; | 354 | struct sk_buff *skb; |
355 | struct request_sock *req; | ||
355 | __u32 seq; | 356 | __u32 seq; |
356 | __u32 remaining; | 357 | __u32 remaining; |
357 | int err; | 358 | int err; |
@@ -394,9 +395,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
394 | 395 | ||
395 | icsk = inet_csk(sk); | 396 | icsk = inet_csk(sk); |
396 | tp = tcp_sk(sk); | 397 | tp = tcp_sk(sk); |
398 | req = tp->fastopen_rsk; | ||
397 | seq = ntohl(th->seq); | 399 | seq = ntohl(th->seq); |
398 | if (sk->sk_state != TCP_LISTEN && | 400 | if (sk->sk_state != TCP_LISTEN && |
399 | !between(seq, tp->snd_una, tp->snd_nxt)) { | 401 | !between(seq, tp->snd_una, tp->snd_nxt) && |
402 | (req == NULL || seq != tcp_rsk(req)->snt_isn)) { | ||
403 | /* For a Fast Open socket, allow seq to be snt_isn. */ | ||
400 | NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); | 404 | NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); |
401 | goto out; | 405 | goto out; |
402 | } | 406 | } |
@@ -435,6 +439,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
435 | !icsk->icsk_backoff) | 439 | !icsk->icsk_backoff) |
436 | break; | 440 | break; |
437 | 441 | ||
442 | /* XXX (TFO) - revisit the following logic for TFO */ | ||
443 | |||
438 | if (sock_owned_by_user(sk)) | 444 | if (sock_owned_by_user(sk)) |
439 | break; | 445 | break; |
440 | 446 | ||
@@ -466,6 +472,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
466 | goto out; | 472 | goto out; |
467 | } | 473 | } |
468 | 474 | ||
475 | /* XXX (TFO) - if it's a TFO socket and has been accepted, rather | ||
476 | * than following the TCP_SYN_RECV case and closing the socket, | ||
477 | * we ignore the ICMP error and keep trying like a fully established | ||
478 | * socket. Is this the right thing to do? | ||
479 | */ | ||
480 | if (req && req->sk == NULL) | ||
481 | goto out; | ||
482 | |||
469 | switch (sk->sk_state) { | 483 | switch (sk->sk_state) { |
470 | struct request_sock *req, **prev; | 484 | struct request_sock *req, **prev; |
471 | case TCP_LISTEN: | 485 | case TCP_LISTEN: |
@@ -498,7 +512,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) | |||
498 | 512 | ||
499 | case TCP_SYN_SENT: | 513 | case TCP_SYN_SENT: |
500 | case TCP_SYN_RECV: /* Cannot happen. | 514 | case TCP_SYN_RECV: /* Cannot happen. |
501 | It can f.e. if SYNs crossed. | 515 | It can f.e. if SYNs crossed, |
516 | or Fast Open. | ||
502 | */ | 517 | */ |
503 | if (!sock_owned_by_user(sk)) { | 518 | if (!sock_owned_by_user(sk)) { |
504 | sk->sk_err = err; | 519 | sk->sk_err = err; |
@@ -809,8 +824,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) | |||
809 | static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | 824 | static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, |
810 | struct request_sock *req) | 825 | struct request_sock *req) |
811 | { | 826 | { |
812 | tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, | 827 | /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV |
813 | tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, | 828 | * sk->sk_state == TCP_SYN_RECV -> for Fast Open. |
829 | */ | ||
830 | tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ? | ||
831 | tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, | ||
832 | tcp_rsk(req)->rcv_nxt, req->rcv_wnd, | ||
814 | req->ts_recent, | 833 | req->ts_recent, |
815 | 0, | 834 | 0, |
816 | tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, | 835 | tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, |
@@ -839,7 +858,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, | |||
839 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) | 858 | if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) |
840 | return -1; | 859 | return -1; |
841 | 860 | ||
842 | skb = tcp_make_synack(sk, dst, req, rvp); | 861 | skb = tcp_make_synack(sk, dst, req, rvp, NULL); |
843 | 862 | ||
844 | if (skb) { | 863 | if (skb) { |
845 | __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); | 864 | __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); |
@@ -1272,6 +1291,178 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { | |||
1272 | }; | 1291 | }; |
1273 | #endif | 1292 | #endif |
1274 | 1293 | ||
1294 | static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, | ||
1295 | struct request_sock *req, | ||
1296 | struct tcp_fastopen_cookie *foc, | ||
1297 | struct tcp_fastopen_cookie *valid_foc) | ||
1298 | { | ||
1299 | bool skip_cookie = false; | ||
1300 | struct fastopen_queue *fastopenq; | ||
1301 | |||
1302 | if (likely(!fastopen_cookie_present(foc))) { | ||
1303 | /* See include/net/tcp.h for the meaning of these knobs */ | ||
1304 | if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) || | ||
1305 | ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) && | ||
1306 | (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1))) | ||
1307 | skip_cookie = true; /* no cookie to validate */ | ||
1308 | else | ||
1309 | return false; | ||
1310 | } | ||
1311 | fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq; | ||
1312 | /* A FO option is present; bump the counter. */ | ||
1313 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); | ||
1314 | |||
1315 | /* Make sure the listener has enabled fastopen, and we don't | ||
1316 | * exceed the max # of pending TFO requests allowed before trying | ||
1317 | * to validate the cookie in order to avoid burning CPU cycles | ||
1318 | * unnecessarily. | ||
1319 | * | ||
1320 | * XXX (TFO) - The implication of checking the max_qlen before | ||
1321 | * processing a cookie request is that clients can't differentiate | ||
1322 | * between qlen overflow causing Fast Open to be disabled | ||
1323 | * temporarily vs a server not supporting Fast Open at all. | ||
1324 | */ | ||
1325 | if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 || | ||
1326 | fastopenq == NULL || fastopenq->max_qlen == 0) | ||
1327 | return false; | ||
1328 | |||
1329 | if (fastopenq->qlen >= fastopenq->max_qlen) { | ||
1330 | struct request_sock *req1; | ||
1331 | spin_lock(&fastopenq->lock); | ||
1332 | req1 = fastopenq->rskq_rst_head; | ||
1333 | if ((req1 == NULL) || time_after(req1->expires, jiffies)) { | ||
1334 | spin_unlock(&fastopenq->lock); | ||
1335 | NET_INC_STATS_BH(sock_net(sk), | ||
1336 | LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); | ||
1337 | /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/ | ||
1338 | foc->len = -1; | ||
1339 | return false; | ||
1340 | } | ||
1341 | fastopenq->rskq_rst_head = req1->dl_next; | ||
1342 | fastopenq->qlen--; | ||
1343 | spin_unlock(&fastopenq->lock); | ||
1344 | reqsk_free(req1); | ||
1345 | } | ||
1346 | if (skip_cookie) { | ||
1347 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
1348 | return true; | ||
1349 | } | ||
1350 | if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) { | ||
1351 | if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) { | ||
1352 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | ||
1353 | if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) || | ||
1354 | memcmp(&foc->val[0], &valid_foc->val[0], | ||
1355 | TCP_FASTOPEN_COOKIE_SIZE) != 0) | ||
1356 | return false; | ||
1357 | valid_foc->len = -1; | ||
1358 | } | ||
1359 | /* Acknowledge the data received from the peer. */ | ||
1360 | tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
1361 | return true; | ||
1362 | } else if (foc->len == 0) { /* Client requesting a cookie */ | ||
1363 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | ||
1364 | NET_INC_STATS_BH(sock_net(sk), | ||
1365 | LINUX_MIB_TCPFASTOPENCOOKIEREQD); | ||
1366 | } else { | ||
1367 | /* Client sent a cookie with wrong size. Treat it | ||
1368 | * the same as invalid and return a valid one. | ||
1369 | */ | ||
1370 | tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc); | ||
1371 | } | ||
1372 | return false; | ||
1373 | } | ||
1374 | |||
1375 | static int tcp_v4_conn_req_fastopen(struct sock *sk, | ||
1376 | struct sk_buff *skb, | ||
1377 | struct sk_buff *skb_synack, | ||
1378 | struct request_sock *req, | ||
1379 | struct request_values *rvp) | ||
1380 | { | ||
1381 | struct tcp_sock *tp = tcp_sk(sk); | ||
1382 | struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; | ||
1383 | const struct inet_request_sock *ireq = inet_rsk(req); | ||
1384 | struct sock *child; | ||
1385 | |||
1386 | req->retrans = 0; | ||
1387 | req->sk = NULL; | ||
1388 | |||
1389 | child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); | ||
1390 | if (child == NULL) { | ||
1391 | NET_INC_STATS_BH(sock_net(sk), | ||
1392 | LINUX_MIB_TCPFASTOPENPASSIVEFAIL); | ||
1393 | kfree_skb(skb_synack); | ||
1394 | return -1; | ||
1395 | } | ||
1396 | ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, | ||
1397 | ireq->rmt_addr, ireq->opt); | ||
1398 | /* XXX (TFO) - is it ok to ignore error and continue? */ | ||
1399 | |||
1400 | spin_lock(&queue->fastopenq->lock); | ||
1401 | queue->fastopenq->qlen++; | ||
1402 | spin_unlock(&queue->fastopenq->lock); | ||
1403 | |||
1404 | /* Initialize the child socket. Have to fix some values to take | ||
1405 | * into account that the child is a Fast Open socket and is created | ||
1406 | * only out of the bits carried in the SYN packet. | ||
1407 | */ | ||
1408 | tp = tcp_sk(child); | ||
1409 | |||
1410 | tp->fastopen_rsk = req; | ||
1411 | /* Do a hold on the listener sk so that if the listener is being | ||
1412 | * closed, the child that has been accepted can live on and still | ||
1413 | * access listen_lock. | ||
1414 | */ | ||
1415 | sock_hold(sk); | ||
1416 | tcp_rsk(req)->listener = sk; | ||
1417 | |||
1418 | /* RFC1323: The window in SYN & SYN/ACK segments is never | ||
1419 | * scaled. So correct it appropriately. | ||
1420 | */ | ||
1421 | tp->snd_wnd = ntohs(tcp_hdr(skb)->window); | ||
1422 | |||
1423 | /* Activate the retrans timer so that SYNACK can be retransmitted. | ||
1424 | * The request socket is not added to the SYN table of the parent | ||
1425 | * because it's been added to the accept queue directly. | ||
1426 | */ | ||
1427 | inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, | ||
1428 | TCP_TIMEOUT_INIT, TCP_RTO_MAX); | ||
1429 | |||
1430 | /* Add the child socket directly into the accept queue */ | ||
1431 | inet_csk_reqsk_queue_add(sk, req, child); | ||
1432 | |||
1433 | /* Now finish processing the fastopen child socket. */ | ||
1434 | inet_csk(child)->icsk_af_ops->rebuild_header(child); | ||
1435 | tcp_init_congestion_control(child); | ||
1436 | tcp_mtup_init(child); | ||
1437 | tcp_init_buffer_space(child); | ||
1438 | tcp_init_metrics(child); | ||
1439 | |||
1440 | /* Queue the data carried in the SYN packet. We need to first | ||
1441 | * bump skb's refcnt because the caller will attempt to free it. | ||
1442 | * | ||
1443 | * XXX (TFO) - we honor a zero-payload TFO request for now. | ||
1444 | * (Any reason not to?) | ||
1445 | */ | ||
1446 | if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) { | ||
1447 | /* Don't queue the skb if there is no payload in SYN. | ||
1448 | * XXX (TFO) - How about SYN+FIN? | ||
1449 | */ | ||
1450 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
1451 | } else { | ||
1452 | skb = skb_get(skb); | ||
1453 | skb_dst_drop(skb); | ||
1454 | __skb_pull(skb, tcp_hdr(skb)->doff * 4); | ||
1455 | skb_set_owner_r(skb, child); | ||
1456 | __skb_queue_tail(&child->sk_receive_queue, skb); | ||
1457 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | ||
1458 | } | ||
1459 | sk->sk_data_ready(sk, 0); | ||
1460 | bh_unlock_sock(child); | ||
1461 | sock_put(child); | ||
1462 | WARN_ON(req->sk == NULL); | ||
1463 | return 0; | ||
1464 | } | ||
1465 | |||
1275 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | 1466 | int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) |
1276 | { | 1467 | { |
1277 | struct tcp_extend_values tmp_ext; | 1468 | struct tcp_extend_values tmp_ext; |
@@ -1285,6 +1476,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1285 | __be32 daddr = ip_hdr(skb)->daddr; | 1476 | __be32 daddr = ip_hdr(skb)->daddr; |
1286 | __u32 isn = TCP_SKB_CB(skb)->when; | 1477 | __u32 isn = TCP_SKB_CB(skb)->when; |
1287 | bool want_cookie = false; | 1478 | bool want_cookie = false; |
1479 | struct flowi4 fl4; | ||
1480 | struct tcp_fastopen_cookie foc = { .len = -1 }; | ||
1481 | struct tcp_fastopen_cookie valid_foc = { .len = -1 }; | ||
1482 | struct sk_buff *skb_synack; | ||
1483 | int do_fastopen; | ||
1288 | 1484 | ||
1289 | /* Never answer to SYNs send to broadcast or multicast */ | 1485 | /* Never answer to SYNs send to broadcast or multicast */ |
1290 | if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) | 1486 | if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) |
@@ -1319,7 +1515,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1319 | tcp_clear_options(&tmp_opt); | 1515 | tcp_clear_options(&tmp_opt); |
1320 | tmp_opt.mss_clamp = TCP_MSS_DEFAULT; | 1516 | tmp_opt.mss_clamp = TCP_MSS_DEFAULT; |
1321 | tmp_opt.user_mss = tp->rx_opt.user_mss; | 1517 | tmp_opt.user_mss = tp->rx_opt.user_mss; |
1322 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); | 1518 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, |
1519 | want_cookie ? NULL : &foc); | ||
1323 | 1520 | ||
1324 | if (tmp_opt.cookie_plus > 0 && | 1521 | if (tmp_opt.cookie_plus > 0 && |
1325 | tmp_opt.saw_tstamp && | 1522 | tmp_opt.saw_tstamp && |
@@ -1377,8 +1574,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1377 | isn = cookie_v4_init_sequence(sk, skb, &req->mss); | 1574 | isn = cookie_v4_init_sequence(sk, skb, &req->mss); |
1378 | req->cookie_ts = tmp_opt.tstamp_ok; | 1575 | req->cookie_ts = tmp_opt.tstamp_ok; |
1379 | } else if (!isn) { | 1576 | } else if (!isn) { |
1380 | struct flowi4 fl4; | ||
1381 | |||
1382 | /* VJ's idea. We save last timestamp seen | 1577 | /* VJ's idea. We save last timestamp seen |
1383 | * from the destination in peer table, when entering | 1578 | * from the destination in peer table, when entering |
1384 | * state TIME-WAIT, and check against it before | 1579 | * state TIME-WAIT, and check against it before |
@@ -1419,14 +1614,52 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1419 | tcp_rsk(req)->snt_isn = isn; | 1614 | tcp_rsk(req)->snt_isn = isn; |
1420 | tcp_rsk(req)->snt_synack = tcp_time_stamp; | 1615 | tcp_rsk(req)->snt_synack = tcp_time_stamp; |
1421 | 1616 | ||
1422 | if (tcp_v4_send_synack(sk, dst, req, | 1617 | if (dst == NULL) { |
1423 | (struct request_values *)&tmp_ext, | 1618 | dst = inet_csk_route_req(sk, &fl4, req); |
1424 | skb_get_queue_mapping(skb), | 1619 | if (dst == NULL) |
1425 | want_cookie) || | 1620 | goto drop_and_free; |
1426 | want_cookie) | 1621 | } |
1622 | do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc); | ||
1623 | |||
1624 | /* We don't call tcp_v4_send_synack() directly because we need | ||
1625 | * to make sure a child socket can be created successfully before | ||
1626 | * sending back synack! | ||
1627 | * | ||
1628 | * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack() | ||
1629 | * (or better yet, call tcp_send_synack() in the child context | ||
1630 | * directly, but will have to fix bunch of other code first) | ||
1631 | * after syn_recv_sock() except one will need to first fix the | ||
1632 | * latter to remove its dependency on the current implementation | ||
1633 | * of tcp_v4_send_synack()->tcp_select_initial_window(). | ||
1634 | */ | ||
1635 | skb_synack = tcp_make_synack(sk, dst, req, | ||
1636 | (struct request_values *)&tmp_ext, | ||
1637 | fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); | ||
1638 | |||
1639 | if (skb_synack) { | ||
1640 | __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr); | ||
1641 | skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb)); | ||
1642 | } else | ||
1643 | goto drop_and_free; | ||
1644 | |||
1645 | if (likely(!do_fastopen)) { | ||
1646 | int err; | ||
1647 | err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr, | ||
1648 | ireq->rmt_addr, ireq->opt); | ||
1649 | err = net_xmit_eval(err); | ||
1650 | if (err || want_cookie) | ||
1651 | goto drop_and_free; | ||
1652 | |||
1653 | tcp_rsk(req)->listener = NULL; | ||
1654 | /* Add the request_sock to the SYN table */ | ||
1655 | inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); | ||
1656 | if (fastopen_cookie_present(&foc) && foc.len != 0) | ||
1657 | NET_INC_STATS_BH(sock_net(sk), | ||
1658 | LINUX_MIB_TCPFASTOPENPASSIVEFAIL); | ||
1659 | } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req, | ||
1660 | (struct request_values *)&tmp_ext)) | ||
1427 | goto drop_and_free; | 1661 | goto drop_and_free; |
1428 | 1662 | ||
1429 | inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); | ||
1430 | return 0; | 1663 | return 0; |
1431 | 1664 | ||
1432 | drop_and_release: | 1665 | drop_and_release: |
@@ -1554,7 +1787,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) | |||
1554 | struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, | 1787 | struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, |
1555 | iph->saddr, iph->daddr); | 1788 | iph->saddr, iph->daddr); |
1556 | if (req) | 1789 | if (req) |
1557 | return tcp_check_req(sk, skb, req, prev); | 1790 | return tcp_check_req(sk, skb, req, prev, false); |
1558 | 1791 | ||
1559 | nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, | 1792 | nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, |
1560 | th->source, iph->daddr, th->dest, inet_iif(skb)); | 1793 | th->source, iph->daddr, th->dest, inet_iif(skb)); |
@@ -1977,6 +2210,7 @@ void tcp_v4_destroy_sock(struct sock *sk) | |||
1977 | tcp_cookie_values_release); | 2210 | tcp_cookie_values_release); |
1978 | tp->cookie_values = NULL; | 2211 | tp->cookie_values = NULL; |
1979 | } | 2212 | } |
2213 | BUG_ON(tp->fastopen_rsk != NULL); | ||
1980 | 2214 | ||
1981 | /* If socket is aborted during connect operation */ | 2215 | /* If socket is aborted during connect operation */ |
1982 | tcp_free_fastopen_req(tp); | 2216 | tcp_free_fastopen_req(tp); |
@@ -2393,7 +2627,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) | |||
2393 | EXPORT_SYMBOL(tcp_proc_unregister); | 2627 | EXPORT_SYMBOL(tcp_proc_unregister); |
2394 | 2628 | ||
2395 | static void get_openreq4(const struct sock *sk, const struct request_sock *req, | 2629 | static void get_openreq4(const struct sock *sk, const struct request_sock *req, |
2396 | struct seq_file *f, int i, int uid, int *len) | 2630 | struct seq_file *f, int i, kuid_t uid, int *len) |
2397 | { | 2631 | { |
2398 | const struct inet_request_sock *ireq = inet_rsk(req); | 2632 | const struct inet_request_sock *ireq = inet_rsk(req); |
2399 | long delta = req->expires - jiffies; | 2633 | long delta = req->expires - jiffies; |
@@ -2410,7 +2644,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, | |||
2410 | 1, /* timers active (only the expire timer) */ | 2644 | 1, /* timers active (only the expire timer) */ |
2411 | jiffies_delta_to_clock_t(delta), | 2645 | jiffies_delta_to_clock_t(delta), |
2412 | req->retrans, | 2646 | req->retrans, |
2413 | uid, | 2647 | from_kuid_munged(seq_user_ns(f), uid), |
2414 | 0, /* non standard timer */ | 2648 | 0, /* non standard timer */ |
2415 | 0, /* open_requests have no inode */ | 2649 | 0, /* open_requests have no inode */ |
2416 | atomic_read(&sk->sk_refcnt), | 2650 | atomic_read(&sk->sk_refcnt), |
@@ -2425,6 +2659,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |||
2425 | const struct tcp_sock *tp = tcp_sk(sk); | 2659 | const struct tcp_sock *tp = tcp_sk(sk); |
2426 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2660 | const struct inet_connection_sock *icsk = inet_csk(sk); |
2427 | const struct inet_sock *inet = inet_sk(sk); | 2661 | const struct inet_sock *inet = inet_sk(sk); |
2662 | struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq; | ||
2428 | __be32 dest = inet->inet_daddr; | 2663 | __be32 dest = inet->inet_daddr; |
2429 | __be32 src = inet->inet_rcv_saddr; | 2664 | __be32 src = inet->inet_rcv_saddr; |
2430 | __u16 destp = ntohs(inet->inet_dport); | 2665 | __u16 destp = ntohs(inet->inet_dport); |
@@ -2461,7 +2696,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |||
2461 | timer_active, | 2696 | timer_active, |
2462 | jiffies_delta_to_clock_t(timer_expires - jiffies), | 2697 | jiffies_delta_to_clock_t(timer_expires - jiffies), |
2463 | icsk->icsk_retransmits, | 2698 | icsk->icsk_retransmits, |
2464 | sock_i_uid(sk), | 2699 | from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), |
2465 | icsk->icsk_probes_out, | 2700 | icsk->icsk_probes_out, |
2466 | sock_i_ino(sk), | 2701 | sock_i_ino(sk), |
2467 | atomic_read(&sk->sk_refcnt), sk, | 2702 | atomic_read(&sk->sk_refcnt), sk, |
@@ -2469,7 +2704,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |||
2469 | jiffies_to_clock_t(icsk->icsk_ack.ato), | 2704 | jiffies_to_clock_t(icsk->icsk_ack.ato), |
2470 | (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, | 2705 | (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, |
2471 | tp->snd_cwnd, | 2706 | tp->snd_cwnd, |
2472 | tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh, | 2707 | sk->sk_state == TCP_LISTEN ? |
2708 | (fastopenq ? fastopenq->max_qlen : 0) : | ||
2709 | (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh), | ||
2473 | len); | 2710 | len); |
2474 | } | 2711 | } |
2475 | 2712 | ||
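
On the listener side, the new tcp_v4_conn_request() path only engages when Fast Open is enabled and a pending-request budget (fastopenq->max_qlen) has been configured, as checked in tcp_fastopen_check(). A minimal server that opts in through the TCP_FASTOPEN socket option might look like the sketch below; it assumes the option value 23 used by this series, that the net.ipv4.tcp_fastopen sysctl has the server bit set, and it omits error handling.

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <sys/types.h>
    #include <unistd.h>

    #ifndef TCP_FASTOPEN
    #define TCP_FASTOPEN 23           /* socket option introduced by this series */
    #endif

    int main(void)
    {
        struct sockaddr_in addr = {
            .sin_family      = AF_INET,
            .sin_port        = htons(8080),
            .sin_addr.s_addr = htonl(INADDR_ANY),
        };
        int qlen = 16;                /* becomes fastopenq->max_qlen */
        char buf[1024];
        int fd, cfd;
        ssize_t n;

        fd = socket(AF_INET, SOCK_STREAM, 0);
        bind(fd, (struct sockaddr *)&addr, sizeof(addr));
        setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
        listen(fd, 128);

        /* A Fast Open child is placed on the accept queue before the 3WHS
         * finishes, so data carried on the SYN is readable right away. */
        cfd = accept(fd, NULL, NULL);
        n = read(cfd, buf, sizeof(buf));
        if (n > 0)
            printf("got %zd bytes carried on the SYN\n", n);

        close(cfd);
        close(fd);
        return 0;
    }
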
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6ff7f10dce9d..e965319d610b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
507 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; | 507 | newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; |
508 | newtp->rx_opt.mss_clamp = req->mss; | 508 | newtp->rx_opt.mss_clamp = req->mss; |
509 | TCP_ECN_openreq_child(newtp, req); | 509 | TCP_ECN_openreq_child(newtp, req); |
510 | newtp->fastopen_rsk = NULL; | ||
510 | 511 | ||
511 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); | 512 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); |
512 | } | 513 | } |
@@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
515 | EXPORT_SYMBOL(tcp_create_openreq_child); | 516 | EXPORT_SYMBOL(tcp_create_openreq_child); |
516 | 517 | ||
517 | /* | 518 | /* |
518 | * Process an incoming packet for SYN_RECV sockets represented | 519 | * Process an incoming packet for SYN_RECV sockets represented as a |
519 | * as a request_sock. | 520 | * request_sock. Normally sk is the listener socket but for TFO it |
521 | * points to the child socket. | ||
522 | * | ||
523 | * XXX (TFO) - The current impl contains special checks for ack | ||
524 | * validation here and inside tcp_v4_reqsk_send_ack(). Can we do better? | ||
520 | */ | 525 | */ |
521 | 526 | ||
522 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | 527 | struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, |
523 | struct request_sock *req, | 528 | struct request_sock *req, |
524 | struct request_sock **prev) | 529 | struct request_sock **prev, |
530 | bool fastopen) | ||
525 | { | 531 | { |
526 | struct tcp_options_received tmp_opt; | 532 | struct tcp_options_received tmp_opt; |
527 | const u8 *hash_location; | 533 | const u8 *hash_location; |
@@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
530 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); | 536 | __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); |
531 | bool paws_reject = false; | 537 | bool paws_reject = false; |
532 | 538 | ||
539 | BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); | ||
540 | |||
533 | tmp_opt.saw_tstamp = 0; | 541 | tmp_opt.saw_tstamp = 0; |
534 | if (th->doff > (sizeof(struct tcphdr)>>2)) { | 542 | if (th->doff > (sizeof(struct tcphdr)>>2)) { |
535 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); | 543 | tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); |
@@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
565 | * | 573 | * |
566 | * Enforce "SYN-ACK" according to figure 8, figure 6 | 574 | * Enforce "SYN-ACK" according to figure 8, figure 6 |
567 | * of RFC793, fixed by RFC1122. | 575 | * of RFC793, fixed by RFC1122. |
576 | * | ||
577 | * Note that even if there is new data in the SYN packet | ||
578 | * it will be thrown away too. | ||
568 | */ | 579 | */ |
569 | req->rsk_ops->rtx_syn_ack(sk, req, NULL); | 580 | req->rsk_ops->rtx_syn_ack(sk, req, NULL); |
570 | return NULL; | 581 | return NULL; |
@@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
622 | * sent (the segment carries an unacceptable ACK) ... | 633 | * sent (the segment carries an unacceptable ACK) ... |
623 | * a reset is sent." | 634 | * a reset is sent." |
624 | * | 635 | * |
625 | * Invalid ACK: reset will be sent by listening socket | 636 | * Invalid ACK: reset will be sent by listening socket. |
637 | * Note that the ACK validity check for a Fast Open socket is done | ||
638 | * elsewhere and is checked directly against the child socket rather | ||
639 | * than req because user data may have been sent out. | ||
626 | */ | 640 | */ |
627 | if ((flg & TCP_FLAG_ACK) && | 641 | if ((flg & TCP_FLAG_ACK) && !fastopen && |
628 | (TCP_SKB_CB(skb)->ack_seq != | 642 | (TCP_SKB_CB(skb)->ack_seq != |
629 | tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) | 643 | tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) |
630 | return sk; | 644 | return sk; |
@@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
637 | /* RFC793: "first check sequence number". */ | 651 | /* RFC793: "first check sequence number". */ |
638 | 652 | ||
639 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, | 653 | if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, |
640 | tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { | 654 | tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { |
641 | /* Out of window: send ACK and drop. */ | 655 | /* Out of window: send ACK and drop. */ |
642 | if (!(flg & TCP_FLAG_RST)) | 656 | if (!(flg & TCP_FLAG_RST)) |
643 | req->rsk_ops->send_ack(sk, skb, req); | 657 | req->rsk_ops->send_ack(sk, skb, req); |
@@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
648 | 662 | ||
649 | /* In sequence, PAWS is OK. */ | 663 | /* In sequence, PAWS is OK. */ |
650 | 664 | ||
651 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) | 665 | if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) |
652 | req->ts_recent = tmp_opt.rcv_tsval; | 666 | req->ts_recent = tmp_opt.rcv_tsval; |
653 | 667 | ||
654 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { | 668 | if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { |
@@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, | |||
667 | 681 | ||
668 | /* ACK sequence verified above, just make sure ACK is | 682 | /* ACK sequence verified above, just make sure ACK is |
669 | * set. If ACK not set, just silently drop the packet. | 683 | * set. If ACK not set, just silently drop the packet. |
684 | * | ||
685 | * XXX (TFO) - if we ever allow "data after SYN", the | ||
686 | * following check needs to be removed. | ||
670 | */ | 687 | */ |
671 | if (!(flg & TCP_FLAG_ACK)) | 688 | if (!(flg & TCP_FLAG_ACK)) |
672 | return NULL; | 689 | return NULL; |
673 | 690 | ||
691 | /* For Fast Open no more processing is needed (sk is the | ||
692 | * child socket). | ||
693 | */ | ||
694 | if (fastopen) | ||
695 | return sk; | ||
696 | |||
674 | /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ | 697 | /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ |
675 | if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && | 698 | if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && |
676 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { | 699 | TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { |
@@ -706,11 +729,21 @@ listen_overflow: | |||
706 | } | 729 | } |
707 | 730 | ||
708 | embryonic_reset: | 731 | embryonic_reset: |
709 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); | 732 | if (!(flg & TCP_FLAG_RST)) { |
710 | if (!(flg & TCP_FLAG_RST)) | 733 | /* Received a bad SYN pkt - for TFO We try not to reset |
734 | * the local connection unless it's really necessary to | ||
735 | * avoid becoming vulnerable to outside attack aiming at | ||
736 | * resetting legit local connections. | ||
737 | */ | ||
711 | req->rsk_ops->send_reset(sk, skb); | 738 | req->rsk_ops->send_reset(sk, skb); |
712 | 739 | } else if (fastopen) { /* received a valid RST pkt */ | |
713 | inet_csk_reqsk_queue_drop(sk, req, prev); | 740 | reqsk_fastopen_remove(sk, req, true); |
741 | tcp_reset(sk); | ||
742 | } | ||
743 | if (!fastopen) { | ||
744 | inet_csk_reqsk_queue_drop(sk, req, prev); | ||
745 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); | ||
746 | } | ||
714 | return NULL; | 747 | return NULL; |
715 | } | 748 | } |
716 | EXPORT_SYMBOL(tcp_check_req); | 749 | EXPORT_SYMBOL(tcp_check_req); |
@@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req); | |||
719 | * Queue segment on the new socket if the new socket is active, | 752 | * Queue segment on the new socket if the new socket is active, |
720 | * otherwise we just shortcircuit this and continue with | 753 | * otherwise we just shortcircuit this and continue with |
721 | * the new socket. | 754 | * the new socket. |
755 | * | ||
756 | * For the vast majority of cases child->sk_state will be TCP_SYN_RECV | ||
757 | * when entering. But other states are possible due to a race condition | ||
758 | * where after __inet_lookup_established() fails but before the listener | ||
759 | * lock is obtained, other packets cause the same connection to | ||
760 | * be created. | ||
722 | */ | 761 | */ |
723 | 762 | ||
724 | int tcp_child_process(struct sock *parent, struct sock *child, | 763 | int tcp_child_process(struct sock *parent, struct sock *child, |
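
The switch from tcp_rsk(req)->rcv_isn + 1 to tcp_rsk(req)->rcv_nxt in the window and timestamp checks above (and in the SYN-ACK's ack_seq in the tcp_output.c hunk below) exists because a Fast Open SYN may carry payload, so the sequence number the server expects next is already past the ISN. A small stand-alone illustration with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustration (not kernel code) of why the request sock now tracks
     * rcv_nxt instead of deriving everything from rcv_isn: a Fast Open SYN
     * may carry payload, so the sequence the server acks in its SYN-ACK is
     * already past rcv_isn + 1. */
    int main(void)
    {
        uint32_t rcv_isn = 0x1000;              /* client ISN from the SYN      */
        uint32_t syn_payload = 120;             /* data bytes riding on the SYN */

        uint32_t pre_tfo_ack = rcv_isn + 1;             /* old rcv_isn + 1    */
        uint32_t rcv_nxt = rcv_isn + 1 + syn_payload;   /* end_seq of the SYN */

        printf("non-TFO SYN-ACK would ack %#x\n", (unsigned int)pre_tfo_ack);
        printf("TFO SYN-ACK acks          %#x (SYN plus %u data bytes)\n",
               (unsigned int)rcv_nxt, (unsigned int)syn_payload);
        return 0;
    }
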
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d04632673a9e..9383b51f3efc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -702,7 +702,8 @@ static unsigned int tcp_synack_options(struct sock *sk, | |||
702 | unsigned int mss, struct sk_buff *skb, | 702 | unsigned int mss, struct sk_buff *skb, |
703 | struct tcp_out_options *opts, | 703 | struct tcp_out_options *opts, |
704 | struct tcp_md5sig_key **md5, | 704 | struct tcp_md5sig_key **md5, |
705 | struct tcp_extend_values *xvp) | 705 | struct tcp_extend_values *xvp, |
706 | struct tcp_fastopen_cookie *foc) | ||
706 | { | 707 | { |
707 | struct inet_request_sock *ireq = inet_rsk(req); | 708 | struct inet_request_sock *ireq = inet_rsk(req); |
708 | unsigned int remaining = MAX_TCP_OPTION_SPACE; | 709 | unsigned int remaining = MAX_TCP_OPTION_SPACE; |
@@ -747,7 +748,15 @@ static unsigned int tcp_synack_options(struct sock *sk, | |||
747 | if (unlikely(!ireq->tstamp_ok)) | 748 | if (unlikely(!ireq->tstamp_ok)) |
748 | remaining -= TCPOLEN_SACKPERM_ALIGNED; | 749 | remaining -= TCPOLEN_SACKPERM_ALIGNED; |
749 | } | 750 | } |
750 | 751 | if (foc != NULL) { | |
752 | u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; | ||
753 | need = (need + 3) & ~3U; /* Align to 32 bits */ | ||
754 | if (remaining >= need) { | ||
755 | opts->options |= OPTION_FAST_OPEN_COOKIE; | ||
756 | opts->fastopen_cookie = foc; | ||
757 | remaining -= need; | ||
758 | } | ||
759 | } | ||
751 | /* Similar rationale to tcp_syn_options() applies here, too. | 760 | /* Similar rationale to tcp_syn_options() applies here, too. |
752 | * If the <SYN> options fit, the same options should fit now! | 761 | * If the <SYN> options fit, the same options should fit now! |
753 | */ | 762 | */ |
@@ -2658,7 +2667,8 @@ int tcp_send_synack(struct sock *sk) | |||
2658 | */ | 2667 | */ |
2659 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | 2668 | struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, |
2660 | struct request_sock *req, | 2669 | struct request_sock *req, |
2661 | struct request_values *rvp) | 2670 | struct request_values *rvp, |
2671 | struct tcp_fastopen_cookie *foc) | ||
2662 | { | 2672 | { |
2663 | struct tcp_out_options opts; | 2673 | struct tcp_out_options opts; |
2664 | struct tcp_extend_values *xvp = tcp_xv(rvp); | 2674 | struct tcp_extend_values *xvp = tcp_xv(rvp); |
@@ -2718,7 +2728,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2718 | #endif | 2728 | #endif |
2719 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2729 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
2720 | tcp_header_size = tcp_synack_options(sk, req, mss, | 2730 | tcp_header_size = tcp_synack_options(sk, req, mss, |
2721 | skb, &opts, &md5, xvp) | 2731 | skb, &opts, &md5, xvp, foc) |
2722 | + sizeof(*th); | 2732 | + sizeof(*th); |
2723 | 2733 | ||
2724 | skb_push(skb, tcp_header_size); | 2734 | skb_push(skb, tcp_header_size); |
@@ -2772,7 +2782,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2772 | } | 2782 | } |
2773 | 2783 | ||
2774 | th->seq = htonl(TCP_SKB_CB(skb)->seq); | 2784 | th->seq = htonl(TCP_SKB_CB(skb)->seq); |
2775 | th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); | 2785 | /* XXX data is queued and acked as is. No buffer/window check */ |
2786 | th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); | ||
2776 | 2787 | ||
2777 | /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ | 2788 | /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ |
2778 | th->window = htons(min(req->rcv_wnd, 65535U)); | 2789 | th->window = htons(min(req->rcv_wnd, 65535U)); |
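
tcp_synack_options() only adds the cookie when TCPOLEN_EXP_FASTOPEN_BASE + foc->len, rounded up to a multiple of 4, still fits in the remaining option space. The sketch below mirrors that arithmetic and the on-the-wire layout of the experimental Fast Open option (kind 254 with the 0xF989 magic); the constants are redefined locally for illustration and the cookie bytes are made up.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define TCPOPT_EXP                254
    #define TCPOPT_FASTOPEN_MAGIC     0xF989
    #define TCPOLEN_EXP_FASTOPEN_BASE 4        /* kind + len + 2-byte magic */

    /* Same 32-bit alignment rule tcp_synack_options() uses to reserve
     * space for the option in the SYN-ACK. */
    static unsigned int fastopen_option_space(unsigned int cookie_len)
    {
        unsigned int need = TCPOLEN_EXP_FASTOPEN_BASE + cookie_len;

        return (need + 3) & ~3U;               /* align to 32 bits */
    }

    int main(void)
    {
        uint8_t cookie[8] = { 0xde, 0xad, 0xbe, 0xef, 0x01, 0x02, 0x03, 0x04 };
        uint8_t opt[40];
        unsigned int len = TCPOLEN_EXP_FASTOPEN_BASE + sizeof(cookie);

        opt[0] = TCPOPT_EXP;
        opt[1] = (uint8_t)len;
        opt[2] = TCPOPT_FASTOPEN_MAGIC >> 8;
        opt[3] = TCPOPT_FASTOPEN_MAGIC & 0xff;
        memcpy(&opt[4], cookie, sizeof(cookie));

        printf("kind %u len %u magic %02x%02x, option space reserved %u\n",
               opt[0], opt[1], opt[2], opt[3],
               fastopen_option_space(sizeof(cookie)));
        return 0;
    }
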
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b774a03bd1dc..fc04711e80c8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -305,6 +305,35 @@ static void tcp_probe_timer(struct sock *sk) | |||
305 | } | 305 | } |
306 | 306 | ||
307 | /* | 307 | /* |
308 | * Timer for Fast Open socket to retransmit SYNACK. Note that the | ||
309 | * sk here is the child socket, not the parent (listener) socket. | ||
310 | */ | ||
311 | static void tcp_fastopen_synack_timer(struct sock *sk) | ||
312 | { | ||
313 | struct inet_connection_sock *icsk = inet_csk(sk); | ||
314 | int max_retries = icsk->icsk_syn_retries ? : | ||
315 | sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ | ||
316 | struct request_sock *req; | ||
317 | |||
318 | req = tcp_sk(sk)->fastopen_rsk; | ||
319 | req->rsk_ops->syn_ack_timeout(sk, req); | ||
320 | |||
321 | if (req->retrans >= max_retries) { | ||
322 | tcp_write_err(sk); | ||
323 | return; | ||
324 | } | ||
325 | /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error | ||
326 | * returned from rtx_syn_ack() to make it more persistent like | ||
327 | * regular retransmit because if the child socket has been accepted | ||
328 | * it's not good to give up too easily. | ||
329 | */ | ||
330 | req->rsk_ops->rtx_syn_ack(sk, req, NULL); | ||
331 | req->retrans++; | ||
332 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | ||
333 | TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX); | ||
334 | } | ||
335 | |||
336 | /* | ||
308 | * The TCP retransmit timer. | 337 | * The TCP retransmit timer. |
309 | */ | 338 | */ |
310 | 339 | ||
@@ -317,7 +346,15 @@ void tcp_retransmit_timer(struct sock *sk) | |||
317 | tcp_resume_early_retransmit(sk); | 346 | tcp_resume_early_retransmit(sk); |
318 | return; | 347 | return; |
319 | } | 348 | } |
320 | 349 | if (tp->fastopen_rsk) { | |
350 | BUG_ON(sk->sk_state != TCP_SYN_RECV && | ||
351 | sk->sk_state != TCP_FIN_WAIT1); | ||
352 | tcp_fastopen_synack_timer(sk); | ||
353 | /* Before we receive ACK to our SYN-ACK don't retransmit | ||
354 | * anything else (e.g., data or FIN segments). | ||
355 | */ | ||
356 | return; | ||
357 | } | ||
321 | if (!tp->packets_out) | 358 | if (!tp->packets_out) |
322 | goto out; | 359 | goto out; |
323 | 360 | ||
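
tcp_fastopen_synack_timer() gives the child one more retry than sysctl_tcp_synack_retries and backs off by doubling TCP_TIMEOUT_INIT on each expiry, capped at TCP_RTO_MAX. A stand-alone sketch of the resulting schedule, assuming the usual defaults of a 1 s initial timeout, a 120 s cap and 5 SYN-ACK retries:

    #include <stdio.h>

    /* Sketch of the SYN-ACK retransmit schedule a Fast Open child follows:
     * the first timer is armed at TCP_TIMEOUT_INIT when the child is
     * created, each expiry retransmits the SYN-ACK and re-arms the timer at
     * TCP_TIMEOUT_INIT << retrans, and one retry is allowed on top of
     * sysctl_tcp_synack_retries. Values are in seconds for readability. */
    int main(void)
    {
        const unsigned int timeout_init = 1, rto_max = 120;
        const unsigned int sysctl_tcp_synack_retries = 5;
        const unsigned int max_retries = sysctl_tcp_synack_retries + 1;
        unsigned int retrans, wait = timeout_init, elapsed = 0;

        for (retrans = 1; retrans <= max_retries; retrans++) {
            elapsed += wait;
            printf("SYN-ACK retransmit %u at t=%us\n", retrans, elapsed);
            wait = timeout_init << retrans;    /* next interval, doubled */
            if (wait > rto_max)
                wait = rto_max;
        }
        elapsed += wait;
        printf("give up (tcp_write_err) at t=%us\n", elapsed);
        return 0;
    }
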
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6f6d1aca3c3d..c4e64328d8ba 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -2110,7 +2110,9 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, | |||
2110 | bucket, src, srcp, dest, destp, sp->sk_state, | 2110 | bucket, src, srcp, dest, destp, sp->sk_state, |
2111 | sk_wmem_alloc_get(sp), | 2111 | sk_wmem_alloc_get(sp), |
2112 | sk_rmem_alloc_get(sp), | 2112 | sk_rmem_alloc_get(sp), |
2113 | 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), | 2113 | 0, 0L, 0, |
2114 | from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), | ||
2115 | 0, sock_i_ino(sp), | ||
2114 | atomic_read(&sp->sk_refcnt), sp, | 2116 | atomic_read(&sp->sk_refcnt), sp, |
2115 | atomic_read(&sp->sk_drops), len); | 2117 | atomic_read(&sp->sk_drops), len); |
2116 | } | 2118 | } |
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 16d0960062be..d2f336ea82ca 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c | |||
@@ -24,7 +24,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, | |||
24 | if (!inet_diag_bc_sk(bc, sk)) | 24 | if (!inet_diag_bc_sk(bc, sk)) |
25 | return 0; | 25 | return 0; |
26 | 26 | ||
27 | return inet_sk_diag_fill(sk, NULL, skb, req, NETLINK_CB(cb->skb).pid, | 27 | return inet_sk_diag_fill(sk, NULL, skb, req, |
28 | sk_user_ns(NETLINK_CB(cb->skb).ssk), | ||
29 | NETLINK_CB(cb->skb).pid, | ||
28 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); | 30 | cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); |
29 | } | 31 | } |
30 | 32 | ||
@@ -69,6 +71,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, | |||
69 | goto out; | 71 | goto out; |
70 | 72 | ||
71 | err = inet_sk_diag_fill(sk, NULL, rep, req, | 73 | err = inet_sk_diag_fill(sk, NULL, rep, req, |
74 | sk_user_ns(NETLINK_CB(in_skb).ssk), | ||
72 | NETLINK_CB(in_skb).pid, | 75 | NETLINK_CB(in_skb).pid, |
73 | nlh->nlmsg_seq, 0, nlh); | 76 | nlh->nlmsg_seq, 0, nlh); |
74 | if (err < 0) { | 77 | if (err < 0) { |