author     Jonathan Herman <hermanjl@cs.unc.edu>    2013-01-17 16:15:55 -0500
committer  Jonathan Herman <hermanjl@cs.unc.edu>    2013-01-17 16:15:55 -0500
commit     8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree       a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /net/ipv4
parent     406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 29
-rw-r--r--  net/ipv4/Makefile | 6
-rw-r--r--  net/ipv4/af_inet.c | 247
-rw-r--r--  net/ipv4/ah4.c | 36
-rw-r--r--  net/ipv4/arp.c | 90
-rw-r--r--  net/ipv4/cipso_ipv4.c | 19
-rw-r--r--  net/ipv4/devinet.c | 360
-rw-r--r--  net/ipv4/esp4.c | 51
-rw-r--r--  net/ipv4/fib_frontend.c | 188
-rw-r--r--  net/ipv4/fib_rules.c | 59
-rw-r--r--  net/ipv4/fib_semantics.c | 154
-rw-r--r--  net/ipv4/fib_trie.c | 95
-rw-r--r--  net/ipv4/gre.c | 10
-rw-r--r--  net/ipv4/icmp.c | 217
-rw-r--r--  net/ipv4/igmp.c | 81
-rw-r--r--  net/ipv4/inet_connection_sock.c | 212
-rw-r--r--  net/ipv4/inet_diag.c | 762
-rw-r--r--  net/ipv4/inet_fragment.c | 11
-rw-r--r--  net/ipv4/inet_hashtables.c | 38
-rw-r--r--  net/ipv4/inet_lro.c | 10
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 8
-rw-r--r--  net/ipv4/inetpeer.c | 152
-rw-r--r--  net/ipv4/ip_forward.c | 7
-rw-r--r--  net/ipv4/ip_fragment.c | 118
-rw-r--r--  net/ipv4/ip_gre.c | 327
-rw-r--r--  net/ipv4/ip_input.c | 60
-rw-r--r--  net/ipv4/ip_options.c | 70
-rw-r--r--  net/ipv4/ip_output.c | 212
-rw-r--r--  net/ipv4/ip_sockglue.c | 149
-rw-r--r--  net/ipv4/ip_vti.c | 942
-rw-r--r--  net/ipv4/ipcomp.c | 25
-rw-r--r--  net/ipv4/ipconfig.c | 197
-rw-r--r--  net/ipv4/ipip.c | 402
-rw-r--r--  net/ipv4/ipmr.c | 231
-rw-r--r--  net/ipv4/netfilter.c | 57
-rw-r--r--  net/ipv4/netfilter/Kconfig | 130
-rw-r--r--  net/ipv4/netfilter/Makefile | 24
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 15
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 13
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 13
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 31
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 9
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 28
-rw-r--r--  net/ipv4/netfilter/ipt_rpfilter.c | 141
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c | 19
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 10
-rw-r--r--  net/ipv4/netfilter/iptable_nat.c | 329
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c | 10
-rw-r--r--  net/ipv4/netfilter/iptable_security.c | 5
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 199
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 1
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 143
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c | 167
-rw-r--r--  net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 281
-rw-r--r--  net/ipv4/netfilter/nf_nat_pptp.c | 53
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c | 36
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c | 27
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 38
-rw-r--r--  net/ipv4/ping.c | 84
-rw-r--r--  net/ipv4/proc.c | 30
-rw-r--r--  net/ipv4/protocol.c | 29
-rw-r--r--  net/ipv4/raw.c | 46
-rw-r--r--  net/ipv4/route.c | 2462
-rw-r--r--  net/ipv4/syncookies.c | 40
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 215
-rw-r--r--  net/ipv4/tcp.c | 891
-rw-r--r--  net/ipv4/tcp_bic.c | 11
-rw-r--r--  net/ipv4/tcp_cong.c | 30
-rw-r--r--  net/ipv4/tcp_cubic.c | 10
-rw-r--r--  net/ipv4/tcp_diag.c | 20
-rw-r--r--  net/ipv4/tcp_fastopen.c | 92
-rw-r--r--  net/ipv4/tcp_hybla.c | 10
-rw-r--r--  net/ipv4/tcp_illinois.c | 8
-rw-r--r--  net/ipv4/tcp_input.c | 1805
-rw-r--r--  net/ipv4/tcp_ipv4.c | 1044
-rw-r--r--  net/ipv4/tcp_memcontrol.c | 291
-rw-r--r--  net/ipv4/tcp_metrics.c | 1091
-rw-r--r--  net/ipv4/tcp_minisocks.c | 183
-rw-r--r--  net/ipv4/tcp_output.c | 742
-rw-r--r--  net/ipv4/tcp_probe.c | 8
-rw-r--r--  net/ipv4/tcp_timer.c | 143
-rw-r--r--  net/ipv4/tunnel4.c | 18
-rw-r--r--  net/ipv4/udp.c | 185
-rw-r--r--  net/ipv4/udp_diag.c | 216
-rw-r--r--  net/ipv4/udp_impl.h | 2
-rw-r--r--  net/ipv4/udplite.c | 21
-rw-r--r--  net/ipv4/xfrm4_mode_beet.c | 5
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 74
-rw-r--r--  net/ipv4/xfrm4_policy.c | 54
-rw-r--r--  net/ipv4/xfrm4_state.c | 1
-rw-r--r--  net/ipv4/xfrm4_tunnel.c | 22
92 files changed, 5920 insertions(+), 11321 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 5a19aeb8609..cbb505ba932 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -163,6 +163,8 @@ config IP_PNP_RARP
163 operating on your network. Read 163 operating on your network. Read
164 <file:Documentation/filesystems/nfs/nfsroot.txt> for details. 164 <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
165 165
166# not yet ready..
167# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
166config NET_IPIP 168config NET_IPIP
167 tristate "IP: tunneling" 169 tristate "IP: tunneling"
168 select INET_TUNNEL 170 select INET_TUNNEL
@@ -262,8 +264,8 @@ config ARPD
262 bool "IP: ARP daemon support" 264 bool "IP: ARP daemon support"
263 ---help--- 265 ---help---
264 The kernel maintains an internal cache which maps IP addresses to 266 The kernel maintains an internal cache which maps IP addresses to
265 hardware addresses on the local network, so that Ethernet 267 hardware addresses on the local network, so that Ethernet/Token Ring/
266 frames are sent to the proper address on the physical networking 268 etc. frames are sent to the proper address on the physical networking
267 layer. Normally, kernel uses the ARP protocol to resolve these 269 layer. Normally, kernel uses the ARP protocol to resolve these
268 mappings. 270 mappings.
269 271
@@ -310,20 +312,9 @@ config SYN_COOKIES
310 312
311 If unsure, say N. 313 If unsure, say N.
312 314
313config NET_IPVTI
314 tristate "Virtual (secure) IP: tunneling"
315 select INET_TUNNEL
316 depends on INET_XFRM_MODE_TUNNEL
317 ---help---
318 Tunneling means encapsulating data of one protocol type within
319 another protocol and sending it over a channel that understands the
320 encapsulating protocol. This can be used with xfrm mode tunnel to give
321 the notion of a secure tunnel for IPSEC and then use routing protocol
322 on top.
323
324config INET_AH 315config INET_AH
325 tristate "IP: AH transformation" 316 tristate "IP: AH transformation"
326 select XFRM_ALGO 317 select XFRM
327 select CRYPTO 318 select CRYPTO
328 select CRYPTO_HMAC 319 select CRYPTO_HMAC
329 select CRYPTO_MD5 320 select CRYPTO_MD5
@@ -335,7 +326,7 @@ config INET_AH
335 326
336config INET_ESP 327config INET_ESP
337 tristate "IP: ESP transformation" 328 tristate "IP: ESP transformation"
338 select XFRM_ALGO 329 select XFRM
339 select CRYPTO 330 select CRYPTO
340 select CRYPTO_AUTHENC 331 select CRYPTO_AUTHENC
341 select CRYPTO_HMAC 332 select CRYPTO_HMAC
@@ -418,14 +409,6 @@ config INET_TCP_DIAG
418 depends on INET_DIAG 409 depends on INET_DIAG
419 def_tristate INET_DIAG 410 def_tristate INET_DIAG
420 411
421config INET_UDP_DIAG
422 tristate "UDP: socket monitoring interface"
423 depends on INET_DIAG && (IPV6 || IPV6=n)
424 default n
425 ---help---
426 Support for UDP socket monitoring interface used by the ss tool.
427 If unsure, say Y.
428
429menuconfig TCP_CONG_ADVANCED 412menuconfig TCP_CONG_ADVANCED
430 bool "TCP: advanced congestion control" 413 bool "TCP: advanced congestion control"
431 ---help--- 414 ---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 15ca63ec604..681084d76a9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,20 +7,20 @@ obj-y := route.o inetpeer.o protocol.o \
7 ip_output.o ip_sockglue.o inet_hashtables.o \ 7 ip_output.o ip_sockglue.o inet_hashtables.o \
8 inet_timewait_sock.o inet_connection_sock.o \ 8 inet_timewait_sock.o inet_connection_sock.o \
9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ 9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
10 tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ 10 tcp_minisocks.o tcp_cong.o \
11 datagram.o raw.o udp.o udplite.o \ 11 datagram.o raw.o udp.o udplite.o \
12 arp.o icmp.o devinet.o af_inet.o igmp.o \ 12 arp.o icmp.o devinet.o af_inet.o igmp.o \
13 fib_frontend.o fib_semantics.o fib_trie.o \ 13 fib_frontend.o fib_semantics.o fib_trie.o \
14 inet_fragment.o ping.o 14 inet_fragment.o ping.o
15 15
16obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o 16obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
17obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o
17obj-$(CONFIG_PROC_FS) += proc.o 18obj-$(CONFIG_PROC_FS) += proc.o
18obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o 19obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
19obj-$(CONFIG_IP_MROUTE) += ipmr.o 20obj-$(CONFIG_IP_MROUTE) += ipmr.o
20obj-$(CONFIG_NET_IPIP) += ipip.o 21obj-$(CONFIG_NET_IPIP) += ipip.o
21obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o 22obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
22obj-$(CONFIG_NET_IPGRE) += ip_gre.o 23obj-$(CONFIG_NET_IPGRE) += ip_gre.o
23obj-$(CONFIG_NET_IPVTI) += ip_vti.o
24obj-$(CONFIG_SYN_COOKIES) += syncookies.o 24obj-$(CONFIG_SYN_COOKIES) += syncookies.o
25obj-$(CONFIG_INET_AH) += ah4.o 25obj-$(CONFIG_INET_AH) += ah4.o
26obj-$(CONFIG_INET_ESP) += esp4.o 26obj-$(CONFIG_INET_ESP) += esp4.o
@@ -35,7 +35,6 @@ obj-$(CONFIG_IP_PNP) += ipconfig.o
35obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ 35obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
36obj-$(CONFIG_INET_DIAG) += inet_diag.o 36obj-$(CONFIG_INET_DIAG) += inet_diag.o
37obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o 37obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
38obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
39obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o 38obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
40obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o 39obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
41obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o 40obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
@@ -49,7 +48,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
49obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o 48obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
50obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o 49obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
51obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o 50obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
52obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
53obj-$(CONFIG_NETLABEL) += cipso_ipv4.o 51obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
54 52
55obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 53obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 24b384b7903..bf488051a8d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -65,8 +65,6 @@
65 * 2 of the License, or (at your option) any later version. 65 * 2 of the License, or (at your option) any later version.
66 */ 66 */
67 67
68#define pr_fmt(fmt) "IPv4: " fmt
69
70#include <linux/err.h> 68#include <linux/err.h>
71#include <linux/errno.h> 69#include <linux/errno.h>
72#include <linux/types.h> 70#include <linux/types.h>
@@ -91,6 +89,7 @@
91#include <linux/slab.h> 89#include <linux/slab.h>
92 90
93#include <asm/uaccess.h> 91#include <asm/uaccess.h>
92#include <asm/system.h>
94 93
95#include <linux/inet.h> 94#include <linux/inet.h>
96#include <linux/igmp.h> 95#include <linux/igmp.h>
@@ -119,6 +118,19 @@
119#include <linux/mroute.h> 118#include <linux/mroute.h>
120#endif 119#endif
121 120
121#ifdef CONFIG_ANDROID_PARANOID_NETWORK
122#include <linux/android_aid.h>
123
124static inline int current_has_network(void)
125{
126 return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
127}
128#else
129static inline int current_has_network(void)
130{
131 return 1;
132}
133#endif
122 134
123/* The inetsw table contains everything that inet_create needs to 135/* The inetsw table contains everything that inet_create needs to
124 * build a new socket. 136 * build a new socket.
@@ -157,7 +169,6 @@ void inet_sock_destruct(struct sock *sk)
157 169
158 kfree(rcu_dereference_protected(inet->inet_opt, 1)); 170 kfree(rcu_dereference_protected(inet->inet_opt, 1));
159 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); 171 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
160 dst_release(sk->sk_rx_dst);
161 sk_refcnt_debug_dec(sk); 172 sk_refcnt_debug_dec(sk);
162} 173}
163EXPORT_SYMBOL(inet_sock_destruct); 174EXPORT_SYMBOL(inet_sock_destruct);
@@ -212,26 +223,6 @@ int inet_listen(struct socket *sock, int backlog)
212 * we can only allow the backlog to be adjusted. 223 * we can only allow the backlog to be adjusted.
213 */ 224 */
214 if (old_state != TCP_LISTEN) { 225 if (old_state != TCP_LISTEN) {
215 /* Check special setups for testing purpose to enable TFO w/o
216 * requiring TCP_FASTOPEN sockopt.
217 * Note that only TCP sockets (SOCK_STREAM) will reach here.
218 * Also fastopenq may already been allocated because this
219 * socket was in TCP_LISTEN state previously but was
220 * shutdown() (rather than close()).
221 */
222 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
223 inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
224 if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
225 err = fastopen_init_queue(sk, backlog);
226 else if ((sysctl_tcp_fastopen &
227 TFO_SERVER_WO_SOCKOPT2) != 0)
228 err = fastopen_init_queue(sk,
229 ((uint)sysctl_tcp_fastopen) >> 16);
230 else
231 err = 0;
232 if (err)
233 goto out;
234 }
235 err = inet_csk_listen_start(sk, backlog); 226 err = inet_csk_listen_start(sk, backlog);
236 if (err) 227 if (err)
237 goto out; 228 goto out;
@@ -263,21 +254,24 @@ void build_ehash_secret(void)
263} 254}
264EXPORT_SYMBOL(build_ehash_secret); 255EXPORT_SYMBOL(build_ehash_secret);
265 256
266static inline int inet_netns_ok(struct net *net, __u8 protocol) 257static inline int inet_netns_ok(struct net *net, int protocol)
267{ 258{
259 int hash;
268 const struct net_protocol *ipprot; 260 const struct net_protocol *ipprot;
269 261
270 if (net_eq(net, &init_net)) 262 if (net_eq(net, &init_net))
271 return 1; 263 return 1;
272 264
273 ipprot = rcu_dereference(inet_protos[protocol]); 265 hash = protocol & (MAX_INET_PROTOS - 1);
274 if (ipprot == NULL) { 266 ipprot = rcu_dereference(inet_protos[hash]);
267
268 if (ipprot == NULL)
275 /* raw IP is OK */ 269 /* raw IP is OK */
276 return 1; 270 return 1;
277 }
278 return ipprot->netns_ok; 271 return ipprot->netns_ok;
279} 272}
280 273
274
281/* 275/*
282 * Create an inet socket. 276 * Create an inet socket.
283 */ 277 */
@@ -294,6 +288,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
294 int try_loading_module = 0; 288 int try_loading_module = 0;
295 int err; 289 int err;
296 290
291 if (!current_has_network())
292 return -EACCES;
293
297 if (unlikely(!inet_ehash_secret)) 294 if (unlikely(!inet_ehash_secret))
298 if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) 295 if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
299 build_ehash_secret(); 296 build_ehash_secret();
@@ -346,8 +343,7 @@ lookup_protocol:
346 } 343 }
347 344
348 err = -EPERM; 345 err = -EPERM;
349 if (sock->type == SOCK_RAW && !kern && 346 if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
350 !ns_capable(net->user_ns, CAP_NET_RAW))
351 goto out_rcu_unlock; 347 goto out_rcu_unlock;
352 348
353 err = -EAFNOSUPPORT; 349 err = -EAFNOSUPPORT;
@@ -370,7 +366,7 @@ lookup_protocol:
370 err = 0; 366 err = 0;
371 sk->sk_no_check = answer_no_check; 367 sk->sk_no_check = answer_no_check;
372 if (INET_PROTOSW_REUSE & answer_flags) 368 if (INET_PROTOSW_REUSE & answer_flags)
373 sk->sk_reuse = SK_CAN_REUSE; 369 sk->sk_reuse = 1;
374 370
375 inet = inet_sk(sk); 371 inet = inet_sk(sk);
376 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; 372 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
@@ -402,7 +398,6 @@ lookup_protocol:
402 inet->mc_all = 1; 398 inet->mc_all = 1;
403 inet->mc_index = 0; 399 inet->mc_index = 0;
404 inet->mc_list = NULL; 400 inet->mc_list = NULL;
405 inet->rcv_tos = 0;
406 401
407 sk_refcnt_debug_inc(sk); 402 sk_refcnt_debug_inc(sk);
408 403
@@ -474,7 +469,6 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
474 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; 469 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
475 struct sock *sk = sock->sk; 470 struct sock *sk = sock->sk;
476 struct inet_sock *inet = inet_sk(sk); 471 struct inet_sock *inet = inet_sk(sk);
477 struct net *net = sock_net(sk);
478 unsigned short snum; 472 unsigned short snum;
479 int chk_addr_ret; 473 int chk_addr_ret;
480 int err; 474 int err;
@@ -498,7 +492,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
498 goto out; 492 goto out;
499 } 493 }
500 494
501 chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); 495 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
502 496
503 /* Not specified by any standard per-se, however it breaks too 497 /* Not specified by any standard per-se, however it breaks too
504 * many applications when removed. It is unfortunate since 498 * many applications when removed. It is unfortunate since
@@ -518,8 +512,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
518 512
519 snum = ntohs(addr->sin_port); 513 snum = ntohs(addr->sin_port);
520 err = -EACCES; 514 err = -EACCES;
521 if (snum && snum < PROT_SOCK && 515 if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
522 !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
523 goto out; 516 goto out;
524 517
525 /* We keep a pair of addresses. rcv_saddr is the one 518 /* We keep a pair of addresses. rcv_saddr is the one
@@ -563,7 +556,7 @@ out:
563} 556}
564EXPORT_SYMBOL(inet_bind); 557EXPORT_SYMBOL(inet_bind);
565 558
566int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, 559int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
567 int addr_len, int flags) 560 int addr_len, int flags)
568{ 561{
569 struct sock *sk = sock->sk; 562 struct sock *sk = sock->sk;
@@ -575,16 +568,15 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
575 568
576 if (!inet_sk(sk)->inet_num && inet_autobind(sk)) 569 if (!inet_sk(sk)->inet_num && inet_autobind(sk))
577 return -EAGAIN; 570 return -EAGAIN;
578 return sk->sk_prot->connect(sk, uaddr, addr_len); 571 return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
579} 572}
580EXPORT_SYMBOL(inet_dgram_connect); 573EXPORT_SYMBOL(inet_dgram_connect);
581 574
582static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) 575static long inet_wait_for_connect(struct sock *sk, long timeo)
583{ 576{
584 DEFINE_WAIT(wait); 577 DEFINE_WAIT(wait);
585 578
586 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 579 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
587 sk->sk_write_pending += writebias;
588 580
589 /* Basic assumption: if someone sets sk->sk_err, he _must_ 581 /* Basic assumption: if someone sets sk->sk_err, he _must_
590 * change state of the socket from TCP_SYN_*. 582 * change state of the socket from TCP_SYN_*.
@@ -600,7 +592,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
600 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 592 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
601 } 593 }
602 finish_wait(sk_sleep(sk), &wait); 594 finish_wait(sk_sleep(sk), &wait);
603 sk->sk_write_pending -= writebias;
604 return timeo; 595 return timeo;
605} 596}
606 597
@@ -608,8 +599,8 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
608 * Connect to a remote host. There is regrettably still a little 599 * Connect to a remote host. There is regrettably still a little
609 * TCP 'magic' in here. 600 * TCP 'magic' in here.
610 */ 601 */
611int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, 602int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
612 int addr_len, int flags) 603 int addr_len, int flags)
613{ 604{
614 struct sock *sk = sock->sk; 605 struct sock *sk = sock->sk;
615 int err; 606 int err;
@@ -618,6 +609,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
618 if (addr_len < sizeof(uaddr->sa_family)) 609 if (addr_len < sizeof(uaddr->sa_family))
619 return -EINVAL; 610 return -EINVAL;
620 611
612 lock_sock(sk);
613
621 if (uaddr->sa_family == AF_UNSPEC) { 614 if (uaddr->sa_family == AF_UNSPEC) {
622 err = sk->sk_prot->disconnect(sk, flags); 615 err = sk->sk_prot->disconnect(sk, flags);
623 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; 616 sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -657,12 +650,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
657 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 650 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
658 651
659 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 652 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
660 int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
661 tcp_sk(sk)->fastopen_req &&
662 tcp_sk(sk)->fastopen_req->data ? 1 : 0;
663
664 /* Error code is set above */ 653 /* Error code is set above */
665 if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) 654 if (!timeo || !inet_wait_for_connect(sk, timeo))
666 goto out; 655 goto out;
667 656
668 err = sock_intr_errno(timeo); 657 err = sock_intr_errno(timeo);
@@ -684,6 +673,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
684 sock->state = SS_CONNECTED; 673 sock->state = SS_CONNECTED;
685 err = 0; 674 err = 0;
686out: 675out:
676 release_sock(sk);
687 return err; 677 return err;
688 678
689sock_error: 679sock_error:
@@ -693,18 +683,6 @@ sock_error:
693 sock->state = SS_DISCONNECTING; 683 sock->state = SS_DISCONNECTING;
694 goto out; 684 goto out;
695} 685}
696EXPORT_SYMBOL(__inet_stream_connect);
697
698int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
699 int addr_len, int flags)
700{
701 int err;
702
703 lock_sock(sock->sk);
704 err = __inet_stream_connect(sock, uaddr, addr_len, flags);
705 release_sock(sock->sk);
706 return err;
707}
708EXPORT_SYMBOL(inet_stream_connect); 686EXPORT_SYMBOL(inet_stream_connect);
709 687
710/* 688/*
@@ -724,8 +702,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
724 702
725 sock_rps_record_flow(sk2); 703 sock_rps_record_flow(sk2);
726 WARN_ON(!((1 << sk2->sk_state) & 704 WARN_ON(!((1 << sk2->sk_state) &
727 (TCPF_ESTABLISHED | TCPF_SYN_RECV | 705 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
728 TCPF_CLOSE_WAIT | TCPF_CLOSE)));
729 706
730 sock_graft(sk2, newsock); 707 sock_graft(sk2, newsock);
731 708
@@ -919,6 +896,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
919 case SIOCSIFPFLAGS: 896 case SIOCSIFPFLAGS:
920 case SIOCGIFPFLAGS: 897 case SIOCGIFPFLAGS:
921 case SIOCSIFFLAGS: 898 case SIOCSIFFLAGS:
899 case SIOCKILLADDR:
922 err = devinet_ioctl(net, cmd, (void __user *)arg); 900 err = devinet_ioctl(net, cmd, (void __user *)arg);
923 break; 901 break;
924 default: 902 default:
@@ -933,7 +911,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
933EXPORT_SYMBOL(inet_ioctl); 911EXPORT_SYMBOL(inet_ioctl);
934 912
935#ifdef CONFIG_COMPAT 913#ifdef CONFIG_COMPAT
936static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 914int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
937{ 915{
938 struct sock *sk = sock->sk; 916 struct sock *sk = sock->sk;
939 int err = -ENOIOCTLCMD; 917 int err = -ENOIOCTLCMD;
@@ -1124,11 +1102,13 @@ out:
1124 return; 1102 return;
1125 1103
1126out_permanent: 1104out_permanent:
1127 pr_err("Attempt to override permanent protocol %d\n", protocol); 1105 printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
1106 protocol);
1128 goto out; 1107 goto out;
1129 1108
1130out_illegal: 1109out_illegal:
1131 pr_err("Ignoring attempt to register invalid socket type %d\n", 1110 printk(KERN_ERR
1111 "Ignoring attempt to register invalid socket type %d.\n",
1132 p->type); 1112 p->type);
1133 goto out; 1113 goto out;
1134} 1114}
@@ -1137,7 +1117,8 @@ EXPORT_SYMBOL(inet_register_protosw);
1137void inet_unregister_protosw(struct inet_protosw *p) 1117void inet_unregister_protosw(struct inet_protosw *p)
1138{ 1118{
1139 if (INET_PROTOSW_PERMANENT & p->flags) { 1119 if (INET_PROTOSW_PERMANENT & p->flags) {
1140 pr_err("Attempt to unregister permanent protocol %d\n", 1120 printk(KERN_ERR
1121 "Attempt to unregister permanent protocol %d.\n",
1141 p->protocol); 1122 p->protocol);
1142 } else { 1123 } else {
1143 spin_lock_bh(&inetsw_lock); 1124 spin_lock_bh(&inetsw_lock);
@@ -1186,8 +1167,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
1186 return 0; 1167 return 0;
1187 1168
1188 if (sysctl_ip_dynaddr > 1) { 1169 if (sysctl_ip_dynaddr > 1) {
1189 pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", 1170 printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n",
1190 __func__, &old_saddr, &new_saddr); 1171 __func__, &old_saddr, &new_saddr);
1191 } 1172 }
1192 1173
1193 inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; 1174 inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
@@ -1254,8 +1235,8 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
1254 1235
1255static int inet_gso_send_check(struct sk_buff *skb) 1236static int inet_gso_send_check(struct sk_buff *skb)
1256{ 1237{
1257 const struct net_offload *ops;
1258 const struct iphdr *iph; 1238 const struct iphdr *iph;
1239 const struct net_protocol *ops;
1259 int proto; 1240 int proto;
1260 int ihl; 1241 int ihl;
1261 int err = -EINVAL; 1242 int err = -EINVAL;
@@ -1274,25 +1255,24 @@ static int inet_gso_send_check(struct sk_buff *skb)
1274 __skb_pull(skb, ihl); 1255 __skb_pull(skb, ihl);
1275 skb_reset_transport_header(skb); 1256 skb_reset_transport_header(skb);
1276 iph = ip_hdr(skb); 1257 iph = ip_hdr(skb);
1277 proto = iph->protocol; 1258 proto = iph->protocol & (MAX_INET_PROTOS - 1);
1278 err = -EPROTONOSUPPORT; 1259 err = -EPROTONOSUPPORT;
1279 1260
1280 rcu_read_lock(); 1261 rcu_read_lock();
1281 ops = rcu_dereference(inet_offloads[proto]); 1262 ops = rcu_dereference(inet_protos[proto]);
1282 if (likely(ops && ops->callbacks.gso_send_check)) 1263 if (likely(ops && ops->gso_send_check))
1283 err = ops->callbacks.gso_send_check(skb); 1264 err = ops->gso_send_check(skb);
1284 rcu_read_unlock(); 1265 rcu_read_unlock();
1285 1266
1286out: 1267out:
1287 return err; 1268 return err;
1288} 1269}
1289 1270
1290static struct sk_buff *inet_gso_segment(struct sk_buff *skb, 1271static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
1291 netdev_features_t features)
1292{ 1272{
1293 struct sk_buff *segs = ERR_PTR(-EINVAL); 1273 struct sk_buff *segs = ERR_PTR(-EINVAL);
1294 const struct net_offload *ops;
1295 struct iphdr *iph; 1274 struct iphdr *iph;
1275 const struct net_protocol *ops;
1296 int proto; 1276 int proto;
1297 int ihl; 1277 int ihl;
1298 int id; 1278 int id;
@@ -1324,13 +1304,13 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
1324 skb_reset_transport_header(skb); 1304 skb_reset_transport_header(skb);
1325 iph = ip_hdr(skb); 1305 iph = ip_hdr(skb);
1326 id = ntohs(iph->id); 1306 id = ntohs(iph->id);
1327 proto = iph->protocol; 1307 proto = iph->protocol & (MAX_INET_PROTOS - 1);
1328 segs = ERR_PTR(-EPROTONOSUPPORT); 1308 segs = ERR_PTR(-EPROTONOSUPPORT);
1329 1309
1330 rcu_read_lock(); 1310 rcu_read_lock();
1331 ops = rcu_dereference(inet_offloads[proto]); 1311 ops = rcu_dereference(inet_protos[proto]);
1332 if (likely(ops && ops->callbacks.gso_segment)) 1312 if (likely(ops && ops->gso_segment))
1333 segs = ops->callbacks.gso_segment(skb, features); 1313 segs = ops->gso_segment(skb, features);
1334 rcu_read_unlock(); 1314 rcu_read_unlock();
1335 1315
1336 if (!segs || IS_ERR(segs)) 1316 if (!segs || IS_ERR(segs))
@@ -1359,7 +1339,7 @@ out:
1359static struct sk_buff **inet_gro_receive(struct sk_buff **head, 1339static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1360 struct sk_buff *skb) 1340 struct sk_buff *skb)
1361{ 1341{
1362 const struct net_offload *ops; 1342 const struct net_protocol *ops;
1363 struct sk_buff **pp = NULL; 1343 struct sk_buff **pp = NULL;
1364 struct sk_buff *p; 1344 struct sk_buff *p;
1365 const struct iphdr *iph; 1345 const struct iphdr *iph;
@@ -1378,17 +1358,17 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1378 goto out; 1358 goto out;
1379 } 1359 }
1380 1360
1381 proto = iph->protocol; 1361 proto = iph->protocol & (MAX_INET_PROTOS - 1);
1382 1362
1383 rcu_read_lock(); 1363 rcu_read_lock();
1384 ops = rcu_dereference(inet_offloads[proto]); 1364 ops = rcu_dereference(inet_protos[proto]);
1385 if (!ops || !ops->callbacks.gro_receive) 1365 if (!ops || !ops->gro_receive)
1386 goto out_unlock; 1366 goto out_unlock;
1387 1367
1388 if (*(u8 *)iph != 0x45) 1368 if (*(u8 *)iph != 0x45)
1389 goto out_unlock; 1369 goto out_unlock;
1390 1370
1391 if (unlikely(ip_fast_csum((u8 *)iph, 5))) 1371 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
1392 goto out_unlock; 1372 goto out_unlock;
1393 1373
1394 id = ntohl(*(__be32 *)&iph->id); 1374 id = ntohl(*(__be32 *)&iph->id);
@@ -1404,6 +1384,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1404 iph2 = ip_hdr(p); 1384 iph2 = ip_hdr(p);
1405 1385
1406 if ((iph->protocol ^ iph2->protocol) | 1386 if ((iph->protocol ^ iph2->protocol) |
1387 (iph->tos ^ iph2->tos) |
1407 ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | 1388 ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
1408 ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { 1389 ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
1409 NAPI_GRO_CB(p)->same_flow = 0; 1390 NAPI_GRO_CB(p)->same_flow = 0;
@@ -1413,7 +1394,6 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1413 /* All fields must match except length and checksum. */ 1394 /* All fields must match except length and checksum. */
1414 NAPI_GRO_CB(p)->flush |= 1395 NAPI_GRO_CB(p)->flush |=
1415 (iph->ttl ^ iph2->ttl) | 1396 (iph->ttl ^ iph2->ttl) |
1416 (iph->tos ^ iph2->tos) |
1417 ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); 1397 ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
1418 1398
1419 NAPI_GRO_CB(p)->flush |= flush; 1399 NAPI_GRO_CB(p)->flush |= flush;
@@ -1423,7 +1403,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
1423 skb_gro_pull(skb, sizeof(*iph)); 1403 skb_gro_pull(skb, sizeof(*iph));
1424 skb_set_transport_header(skb, skb_gro_offset(skb)); 1404 skb_set_transport_header(skb, skb_gro_offset(skb));
1425 1405
1426 pp = ops->callbacks.gro_receive(head, skb); 1406 pp = ops->gro_receive(head, skb);
1427 1407
1428out_unlock: 1408out_unlock:
1429 rcu_read_unlock(); 1409 rcu_read_unlock();
@@ -1436,21 +1416,21 @@ out:
1436 1416
1437static int inet_gro_complete(struct sk_buff *skb) 1417static int inet_gro_complete(struct sk_buff *skb)
1438{ 1418{
1439 __be16 newlen = htons(skb->len - skb_network_offset(skb)); 1419 const struct net_protocol *ops;
1440 struct iphdr *iph = ip_hdr(skb); 1420 struct iphdr *iph = ip_hdr(skb);
1441 const struct net_offload *ops; 1421 int proto = iph->protocol & (MAX_INET_PROTOS - 1);
1442 int proto = iph->protocol;
1443 int err = -ENOSYS; 1422 int err = -ENOSYS;
1423 __be16 newlen = htons(skb->len - skb_network_offset(skb));
1444 1424
1445 csum_replace2(&iph->check, iph->tot_len, newlen); 1425 csum_replace2(&iph->check, iph->tot_len, newlen);
1446 iph->tot_len = newlen; 1426 iph->tot_len = newlen;
1447 1427
1448 rcu_read_lock(); 1428 rcu_read_lock();
1449 ops = rcu_dereference(inet_offloads[proto]); 1429 ops = rcu_dereference(inet_protos[proto]);
1450 if (WARN_ON(!ops || !ops->callbacks.gro_complete)) 1430 if (WARN_ON(!ops || !ops->gro_complete))
1451 goto out_unlock; 1431 goto out_unlock;
1452 1432
1453 err = ops->callbacks.gro_complete(skb); 1433 err = ops->gro_complete(skb);
1454 1434
1455out_unlock: 1435out_unlock:
1456 rcu_read_unlock(); 1436 rcu_read_unlock();
@@ -1558,36 +1538,25 @@ static const struct net_protocol igmp_protocol = {
1558#endif 1538#endif
1559 1539
1560static const struct net_protocol tcp_protocol = { 1540static const struct net_protocol tcp_protocol = {
1561 .early_demux = tcp_v4_early_demux, 1541 .handler = tcp_v4_rcv,
1562 .handler = tcp_v4_rcv, 1542 .err_handler = tcp_v4_err,
1563 .err_handler = tcp_v4_err, 1543 .gso_send_check = tcp_v4_gso_send_check,
1564 .no_policy = 1, 1544 .gso_segment = tcp_tso_segment,
1565 .netns_ok = 1, 1545 .gro_receive = tcp4_gro_receive,
1566}; 1546 .gro_complete = tcp4_gro_complete,
1567 1547 .no_policy = 1,
1568static const struct net_offload tcp_offload = { 1548 .netns_ok = 1,
1569 .callbacks = {
1570 .gso_send_check = tcp_v4_gso_send_check,
1571 .gso_segment = tcp_tso_segment,
1572 .gro_receive = tcp4_gro_receive,
1573 .gro_complete = tcp4_gro_complete,
1574 },
1575}; 1549};
1576 1550
1577static const struct net_protocol udp_protocol = { 1551static const struct net_protocol udp_protocol = {
1578 .handler = udp_rcv, 1552 .handler = udp_rcv,
1579 .err_handler = udp_err, 1553 .err_handler = udp_err,
1554 .gso_send_check = udp4_ufo_send_check,
1555 .gso_segment = udp4_ufo_fragment,
1580 .no_policy = 1, 1556 .no_policy = 1,
1581 .netns_ok = 1, 1557 .netns_ok = 1,
1582}; 1558};
1583 1559
1584static const struct net_offload udp_offload = {
1585 .callbacks = {
1586 .gso_send_check = udp4_ufo_send_check,
1587 .gso_segment = udp4_ufo_fragment,
1588 },
1589};
1590
1591static const struct net_protocol icmp_protocol = { 1560static const struct net_protocol icmp_protocol = {
1592 .handler = icmp_rcv, 1561 .handler = icmp_rcv,
1593 .err_handler = ping_err, 1562 .err_handler = ping_err,
@@ -1621,9 +1590,9 @@ static __net_init int ipv4_mib_init_net(struct net *net)
1621 sizeof(struct icmp_mib), 1590 sizeof(struct icmp_mib),
1622 __alignof__(struct icmp_mib)) < 0) 1591 __alignof__(struct icmp_mib)) < 0)
1623 goto err_icmp_mib; 1592 goto err_icmp_mib;
1624 net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), 1593 if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
1625 GFP_KERNEL); 1594 sizeof(struct icmpmsg_mib),
1626 if (!net->mib.icmpmsg_statistics) 1595 __alignof__(struct icmpmsg_mib)) < 0)
1627 goto err_icmpmsg_mib; 1596 goto err_icmpmsg_mib;
1628 1597
1629 tcp_mib_init(net); 1598 tcp_mib_init(net);
@@ -1647,7 +1616,7 @@ err_tcp_mib:
1647 1616
1648static __net_exit void ipv4_mib_exit_net(struct net *net) 1617static __net_exit void ipv4_mib_exit_net(struct net *net)
1649{ 1618{
1650 kfree(net->mib.icmpmsg_statistics); 1619 snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
1651 snmp_mib_free((void __percpu **)net->mib.icmp_statistics); 1620 snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
1652 snmp_mib_free((void __percpu **)net->mib.udplite_statistics); 1621 snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
1653 snmp_mib_free((void __percpu **)net->mib.udp_statistics); 1622 snmp_mib_free((void __percpu **)net->mib.udp_statistics);
@@ -1672,35 +1641,13 @@ static int ipv4_proc_init(void);
1672 * IP protocol layer initialiser 1641 * IP protocol layer initialiser
1673 */ 1642 */
1674 1643
1675static struct packet_offload ip_packet_offload __read_mostly = {
1676 .type = cpu_to_be16(ETH_P_IP),
1677 .callbacks = {
1678 .gso_send_check = inet_gso_send_check,
1679 .gso_segment = inet_gso_segment,
1680 .gro_receive = inet_gro_receive,
1681 .gro_complete = inet_gro_complete,
1682 },
1683};
1684
1685static int __init ipv4_offload_init(void)
1686{
1687 /*
1688 * Add offloads
1689 */
1690 if (inet_add_offload(&udp_offload, IPPROTO_UDP) < 0)
1691 pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
1692 if (inet_add_offload(&tcp_offload, IPPROTO_TCP) < 0)
1693 pr_crit("%s: Cannot add TCP protocol offlaod\n", __func__);
1694
1695 dev_add_offload(&ip_packet_offload);
1696 return 0;
1697}
1698
1699fs_initcall(ipv4_offload_init);
1700
1701static struct packet_type ip_packet_type __read_mostly = { 1644static struct packet_type ip_packet_type __read_mostly = {
1702 .type = cpu_to_be16(ETH_P_IP), 1645 .type = cpu_to_be16(ETH_P_IP),
1703 .func = ip_rcv, 1646 .func = ip_rcv,
1647 .gso_send_check = inet_gso_send_check,
1648 .gso_segment = inet_gso_segment,
1649 .gro_receive = inet_gro_receive,
1650 .gro_complete = inet_gro_complete,
1704}; 1651};
1705 1652
1706static int __init inet_init(void) 1653static int __init inet_init(void)
@@ -1742,21 +1689,19 @@ static int __init inet_init(void)
1742 ip_static_sysctl_init(); 1689 ip_static_sysctl_init();
1743#endif 1690#endif
1744 1691
1745 tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
1746
1747 /* 1692 /*
1748 * Add all the base protocols. 1693 * Add all the base protocols.
1749 */ 1694 */
1750 1695
1751 if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) 1696 if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
1752 pr_crit("%s: Cannot add ICMP protocol\n", __func__); 1697 printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
1753 if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) 1698 if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
1754 pr_crit("%s: Cannot add UDP protocol\n", __func__); 1699 printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
1755 if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) 1700 if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
1756 pr_crit("%s: Cannot add TCP protocol\n", __func__); 1701 printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
1757#ifdef CONFIG_IP_MULTICAST 1702#ifdef CONFIG_IP_MULTICAST
1758 if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) 1703 if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
1759 pr_crit("%s: Cannot add IGMP protocol\n", __func__); 1704 printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
1760#endif 1705#endif
1761 1706
1762 /* Register the socket-side information for inet_create. */ 1707 /* Register the socket-side information for inet_create. */
@@ -1803,14 +1748,14 @@ static int __init inet_init(void)
1803 */ 1748 */
1804#if defined(CONFIG_IP_MROUTE) 1749#if defined(CONFIG_IP_MROUTE)
1805 if (ip_mr_init()) 1750 if (ip_mr_init())
1806 pr_crit("%s: Cannot init ipv4 mroute\n", __func__); 1751 printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
1807#endif 1752#endif
1808 /* 1753 /*
1809 * Initialise per-cpu ipv4 mibs 1754 * Initialise per-cpu ipv4 mibs
1810 */ 1755 */
1811 1756
1812 if (init_ipv4_mibs()) 1757 if (init_ipv4_mibs())
1813 pr_crit("%s: Cannot init ipv4 mibs\n", __func__); 1758 printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
1814 1759
1815 ipv4_proc_init(); 1760 ipv4_proc_init();
1816 1761
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index a0d8392491c..36d14406261 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,5 +1,3 @@
1#define pr_fmt(fmt) "IPsec: " fmt
2
3#include <crypto/hash.h> 1#include <crypto/hash.h>
4#include <linux/err.h> 2#include <linux/err.h>
5#include <linux/module.h> 3#include <linux/module.h>
@@ -77,7 +75,7 @@ static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
77 75
78static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) 76static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
79{ 77{
80 unsigned char *optptr = (unsigned char *)(iph+1); 78 unsigned char * optptr = (unsigned char*)(iph+1);
81 int l = iph->ihl*4 - sizeof(struct iphdr); 79 int l = iph->ihl*4 - sizeof(struct iphdr);
82 int optlen; 80 int optlen;
83 81
@@ -398,25 +396,16 @@ static void ah4_err(struct sk_buff *skb, u32 info)
398 struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); 396 struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
399 struct xfrm_state *x; 397 struct xfrm_state *x;
400 398
401 switch (icmp_hdr(skb)->type) { 399 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
402 case ICMP_DEST_UNREACH: 400 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
403 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
404 return;
405 case ICMP_REDIRECT:
406 break;
407 default:
408 return; 401 return;
409 }
410 402
411 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 403 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
412 ah->spi, IPPROTO_AH, AF_INET); 404 ah->spi, IPPROTO_AH, AF_INET);
413 if (!x) 405 if (!x)
414 return; 406 return;
415 407 printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
416 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 408 ntohl(ah->spi), ntohl(iph->daddr));
417 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
418 else
419 ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
420 xfrm_state_put(x); 409 xfrm_state_put(x);
421} 410}
422 411
@@ -456,10 +445,9 @@ static int ah_init_state(struct xfrm_state *x)
456 445
457 if (aalg_desc->uinfo.auth.icv_fullbits/8 != 446 if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
458 crypto_ahash_digestsize(ahash)) { 447 crypto_ahash_digestsize(ahash)) {
459 pr_info("%s: %s digestsize %u != %hu\n", 448 printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
460 __func__, x->aalg->alg_name, 449 x->aalg->alg_name, crypto_ahash_digestsize(ahash),
461 crypto_ahash_digestsize(ahash), 450 aalg_desc->uinfo.auth.icv_fullbits/8);
462 aalg_desc->uinfo.auth.icv_fullbits / 8);
463 goto error; 451 goto error;
464 } 452 }
465 453
@@ -522,11 +510,11 @@ static const struct net_protocol ah4_protocol = {
522static int __init ah4_init(void) 510static int __init ah4_init(void)
523{ 511{
524 if (xfrm_register_type(&ah_type, AF_INET) < 0) { 512 if (xfrm_register_type(&ah_type, AF_INET) < 0) {
525 pr_info("%s: can't add xfrm type\n", __func__); 513 printk(KERN_INFO "ip ah init: can't add xfrm type\n");
526 return -EAGAIN; 514 return -EAGAIN;
527 } 515 }
528 if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { 516 if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
529 pr_info("%s: can't add protocol\n", __func__); 517 printk(KERN_INFO "ip ah init: can't add protocol\n");
530 xfrm_unregister_type(&ah_type, AF_INET); 518 xfrm_unregister_type(&ah_type, AF_INET);
531 return -EAGAIN; 519 return -EAGAIN;
532 } 520 }
@@ -536,9 +524,9 @@ static int __init ah4_init(void)
536static void __exit ah4_fini(void) 524static void __exit ah4_fini(void)
537{ 525{
538 if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) 526 if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
539 pr_info("%s: can't remove protocol\n", __func__); 527 printk(KERN_INFO "ip ah close: can't remove protocol\n");
540 if (xfrm_unregister_type(&ah_type, AF_INET) < 0) 528 if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
541 pr_info("%s: can't remove xfrm type\n", __func__); 529 printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
542} 530}
543 531
544module_init(ah4_init); 532module_init(ah4_init);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 9547a273b9e..96a164aa136 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -73,8 +73,6 @@
73 * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. 73 * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
74 */ 74 */
75 75
76#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
77
78#include <linux/module.h> 76#include <linux/module.h>
79#include <linux/types.h> 77#include <linux/types.h>
80#include <linux/string.h> 78#include <linux/string.h>
@@ -91,6 +89,7 @@
91#include <linux/etherdevice.h> 89#include <linux/etherdevice.h>
92#include <linux/fddidevice.h> 90#include <linux/fddidevice.h>
93#include <linux/if_arp.h> 91#include <linux/if_arp.h>
92#include <linux/trdevice.h>
94#include <linux/skbuff.h> 93#include <linux/skbuff.h>
95#include <linux/proc_fs.h> 94#include <linux/proc_fs.h>
96#include <linux/seq_file.h> 95#include <linux/seq_file.h>
@@ -113,7 +112,13 @@
113#include <net/arp.h> 112#include <net/arp.h>
114#include <net/ax25.h> 113#include <net/ax25.h>
115#include <net/netrom.h> 114#include <net/netrom.h>
115#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
116#include <net/atmclip.h>
117struct neigh_table *clip_tbl_hook;
118EXPORT_SYMBOL(clip_tbl_hook);
119#endif
116 120
121#include <asm/system.h>
117#include <linux/uaccess.h> 122#include <linux/uaccess.h>
118 123
119#include <linux/netfilter_arp.h> 124#include <linux/netfilter_arp.h>
@@ -121,7 +126,7 @@
121/* 126/*
122 * Interface to generic neighbour cache. 127 * Interface to generic neighbour cache.
123 */ 128 */
124static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); 129static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
125static int arp_constructor(struct neighbour *neigh); 130static int arp_constructor(struct neighbour *neigh);
126static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); 131static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
127static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); 132static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -159,6 +164,7 @@ static const struct neigh_ops arp_broken_ops = {
159 164
160struct neigh_table arp_tbl = { 165struct neigh_table arp_tbl = {
161 .family = AF_INET, 166 .family = AF_INET,
167 .entry_size = sizeof(struct neighbour) + 4,
162 .key_len = 4, 168 .key_len = 4,
163 .hash = arp_hash, 169 .hash = arp_hash,
164 .constructor = arp_constructor, 170 .constructor = arp_constructor,
@@ -171,7 +177,7 @@ struct neigh_table arp_tbl = {
171 .gc_staletime = 60 * HZ, 177 .gc_staletime = 60 * HZ,
172 .reachable_time = 30 * HZ, 178 .reachable_time = 30 * HZ,
173 .delay_probe_time = 5 * HZ, 179 .delay_probe_time = 5 * HZ,
174 .queue_len_bytes = 64*1024, 180 .queue_len = 3,
175 .ucast_probes = 3, 181 .ucast_probes = 3,
176 .mcast_probes = 3, 182 .mcast_probes = 3,
177 .anycast_delay = 1 * HZ, 183 .anycast_delay = 1 * HZ,
@@ -194,6 +200,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
194 case ARPHRD_IEEE802: 200 case ARPHRD_IEEE802:
195 ip_eth_mc_map(addr, haddr); 201 ip_eth_mc_map(addr, haddr);
196 return 0; 202 return 0;
203 case ARPHRD_IEEE802_TR:
204 ip_tr_mc_map(addr, haddr);
205 return 0;
197 case ARPHRD_INFINIBAND: 206 case ARPHRD_INFINIBAND:
198 ip_ib_mc_map(addr, dev->broadcast, haddr); 207 ip_ib_mc_map(addr, dev->broadcast, haddr);
199 return 0; 208 return 0;
@@ -212,9 +221,9 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
212 221
213static u32 arp_hash(const void *pkey, 222static u32 arp_hash(const void *pkey,
214 const struct net_device *dev, 223 const struct net_device *dev,
215 __u32 *hash_rnd) 224 __u32 hash_rnd)
216{ 225{
217 return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd); 226 return arp_hashfn(*(u32 *)pkey, dev, hash_rnd);
218} 227}
219 228
220static int arp_constructor(struct neighbour *neigh) 229static int arp_constructor(struct neighbour *neigh)
@@ -274,9 +283,9 @@ static int arp_constructor(struct neighbour *neigh)
274 default: 283 default:
275 break; 284 break;
276 case ARPHRD_ROSE: 285 case ARPHRD_ROSE:
277#if IS_ENABLED(CONFIG_AX25) 286#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
278 case ARPHRD_AX25: 287 case ARPHRD_AX25:
279#if IS_ENABLED(CONFIG_NETROM) 288#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
280 case ARPHRD_NETROM: 289 case ARPHRD_NETROM:
281#endif 290#endif
282 neigh->ops = &arp_broken_ops; 291 neigh->ops = &arp_broken_ops;
@@ -321,7 +330,7 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
321static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) 330static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
322{ 331{
323 __be32 saddr = 0; 332 __be32 saddr = 0;
324 u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; 333 u8 *dst_ha = NULL;
325 struct net_device *dev = neigh->dev; 334 struct net_device *dev = neigh->dev;
326 __be32 target = *(__be32 *)neigh->primary_key; 335 __be32 target = *(__be32 *)neigh->primary_key;
327 int probes = atomic_read(&neigh->probes); 336 int probes = atomic_read(&neigh->probes);
@@ -362,9 +371,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
362 probes -= neigh->parms->ucast_probes; 371 probes -= neigh->parms->ucast_probes;
363 if (probes < 0) { 372 if (probes < 0) {
364 if (!(neigh->nud_state & NUD_VALID)) 373 if (!(neigh->nud_state & NUD_VALID))
365 pr_debug("trying to ucast probe in NUD_INVALID\n"); 374 printk(KERN_DEBUG
366 neigh_ha_snapshot(dst_ha, neigh, dev); 375 "trying to ucast probe in NUD_INVALID\n");
367 dst_hw = dst_ha; 376 dst_ha = neigh->ha;
377 read_lock_bh(&neigh->lock);
368 } else { 378 } else {
369 probes -= neigh->parms->app_probes; 379 probes -= neigh->parms->app_probes;
370 if (probes < 0) { 380 if (probes < 0) {
@@ -376,7 +386,9 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
376 } 386 }
377 387
378 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, 388 arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
379 dst_hw, dev->dev_addr, NULL); 389 dst_ha, dev->dev_addr, NULL);
390 if (dst_ha)
391 read_unlock_bh(&neigh->lock);
380} 392}
381 393
382static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) 394static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
@@ -447,7 +459,7 @@ static int arp_set_predefined(int addr_hint, unsigned char *haddr,
447{ 459{
448 switch (addr_hint) { 460 switch (addr_hint) {
449 case RTN_LOCAL: 461 case RTN_LOCAL:
450 pr_debug("arp called for own IP address\n"); 462 printk(KERN_DEBUG "ARP: arp called for own IP address\n");
451 memcpy(haddr, dev->dev_addr, dev->addr_len); 463 memcpy(haddr, dev->dev_addr, dev->addr_len);
452 return 1; 464 return 1;
453 case RTN_MULTICAST: 465 case RTN_MULTICAST:
@@ -468,12 +480,13 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
468 struct neighbour *n; 480 struct neighbour *n;
469 481
470 if (!skb_dst(skb)) { 482 if (!skb_dst(skb)) {
471 pr_debug("arp_find is called with dst==NULL\n"); 483 printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
472 kfree_skb(skb); 484 kfree_skb(skb);
473 return 1; 485 return 1;
474 } 486 }
475 487
476 paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr); 488 paddr = skb_rtable(skb)->rt_gateway;
489
477 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, 490 if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
478 paddr, dev)) 491 paddr, dev))
479 return 0; 492 return 0;
@@ -579,18 +592,16 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
579 struct sk_buff *skb; 592 struct sk_buff *skb;
580 struct arphdr *arp; 593 struct arphdr *arp;
581 unsigned char *arp_ptr; 594 unsigned char *arp_ptr;
582 int hlen = LL_RESERVED_SPACE(dev);
583 int tlen = dev->needed_tailroom;
584 595
585 /* 596 /*
586 * Allocate a buffer 597 * Allocate a buffer
587 */ 598 */
588 599
589 skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); 600 skb = alloc_skb(arp_hdr_len(dev) + LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
590 if (skb == NULL) 601 if (skb == NULL)
591 return NULL; 602 return NULL;
592 603
593 skb_reserve(skb, hlen); 604 skb_reserve(skb, LL_RESERVED_SPACE(dev));
594 skb_reset_network_header(skb); 605 skb_reset_network_header(skb);
595 arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev)); 606 arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
596 skb->dev = dev; 607 skb->dev = dev;
@@ -622,13 +633,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
622 arp->ar_pro = htons(ETH_P_IP); 633 arp->ar_pro = htons(ETH_P_IP);
623 break; 634 break;
624 635
625#if IS_ENABLED(CONFIG_AX25) 636#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
626 case ARPHRD_AX25: 637 case ARPHRD_AX25:
627 arp->ar_hrd = htons(ARPHRD_AX25); 638 arp->ar_hrd = htons(ARPHRD_AX25);
628 arp->ar_pro = htons(AX25_P_IP); 639 arp->ar_pro = htons(AX25_P_IP);
629 break; 640 break;
630 641
631#if IS_ENABLED(CONFIG_NETROM) 642#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
632 case ARPHRD_NETROM: 643 case ARPHRD_NETROM:
633 arp->ar_hrd = htons(ARPHRD_NETROM); 644 arp->ar_hrd = htons(ARPHRD_NETROM);
634 arp->ar_pro = htons(AX25_P_IP); 645 arp->ar_pro = htons(AX25_P_IP);
@@ -636,12 +647,18 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
636#endif 647#endif
637#endif 648#endif
638 649
639#if IS_ENABLED(CONFIG_FDDI) 650#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
640 case ARPHRD_FDDI: 651 case ARPHRD_FDDI:
641 arp->ar_hrd = htons(ARPHRD_ETHER); 652 arp->ar_hrd = htons(ARPHRD_ETHER);
642 arp->ar_pro = htons(ETH_P_IP); 653 arp->ar_pro = htons(ETH_P_IP);
643 break; 654 break;
644#endif 655#endif
656#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
657 case ARPHRD_IEEE802_TR:
658 arp->ar_hrd = htons(ARPHRD_IEEE802);
659 arp->ar_pro = htons(ETH_P_IP);
660 break;
661#endif
645 } 662 }
646 663
647 arp->ar_hln = dev->addr_len; 664 arp->ar_hln = dev->addr_len;
@@ -739,10 +756,11 @@ static int arp_process(struct sk_buff *skb)
739 goto out; 756 goto out;
740 break; 757 break;
741 case ARPHRD_ETHER: 758 case ARPHRD_ETHER:
759 case ARPHRD_IEEE802_TR:
742 case ARPHRD_FDDI: 760 case ARPHRD_FDDI:
743 case ARPHRD_IEEE802: 761 case ARPHRD_IEEE802:
744 /* 762 /*
745 * ETHERNET, and Fibre Channel (which are IEEE 802 763 * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
746 * devices, according to RFC 2625) devices will accept ARP 764 * devices, according to RFC 2625) devices will accept ARP
747 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). 765 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
748 * This is the case also of FDDI, where the RFC 1390 says that 766 * This is the case also of FDDI, where the RFC 1390 says that
@@ -787,8 +805,7 @@ static int arp_process(struct sk_buff *skb)
787 * Check for bad requests for 127.x.x.x and requests for multicast 805 * Check for bad requests for 127.x.x.x and requests for multicast
788 * addresses. If this is one such, delete it. 806 * addresses. If this is one such, delete it.
789 */ 807 */
790 if (ipv4_is_multicast(tip) || 808 if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
791 (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
792 goto out; 809 goto out;
793 810
794/* 811/*
@@ -850,8 +867,7 @@ static int arp_process(struct sk_buff *skb)
850 if (addr_type == RTN_UNICAST && 867 if (addr_type == RTN_UNICAST &&
851 (arp_fwd_proxy(in_dev, dev, rt) || 868 (arp_fwd_proxy(in_dev, dev, rt) ||
852 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || 869 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
853 (rt->dst.dev != dev && 870 pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
854 pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
855 n = neigh_event_ns(&arp_tbl, sha, &sip, dev); 871 n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
856 if (n) 872 if (n)
857 neigh_release(n); 873 neigh_release(n);
@@ -876,7 +892,7 @@ static int arp_process(struct sk_buff *skb)
876 892
877 n = __neigh_lookup(&arp_tbl, &sip, dev, 0); 893 n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
878 894
879 if (IN_DEV_ARP_ACCEPT(in_dev)) { 895 if (IPV4_DEVCONF_ALL(dev_net(dev), ARP_ACCEPT)) {
880 /* Unsolicited ARP is not accepted by default. 896 /* Unsolicited ARP is not accepted by default.
881 It is possible, that this option should be enabled for some 897 It is possible, that this option should be enabled for some
882 devices (strip is candidate) 898 devices (strip is candidate)
@@ -1024,7 +1040,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1024 return -EINVAL; 1040 return -EINVAL;
1025 } 1041 }
1026 switch (dev->type) { 1042 switch (dev->type) {
1027#if IS_ENABLED(CONFIG_FDDI) 1043#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
1028 case ARPHRD_FDDI: 1044 case ARPHRD_FDDI:
1029 /* 1045 /*
1030 * According to RFC 1390, FDDI devices should accept ARP 1046 * According to RFC 1390, FDDI devices should accept ARP
@@ -1047,7 +1063,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1047 neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); 1063 neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
1048 err = PTR_ERR(neigh); 1064 err = PTR_ERR(neigh);
1049 if (!IS_ERR(neigh)) { 1065 if (!IS_ERR(neigh)) {
1050 unsigned int state = NUD_STALE; 1066 unsigned state = NUD_STALE;
1051 if (r->arp_flags & ATF_PERM) 1067 if (r->arp_flags & ATF_PERM)
1052 state = NUD_PERMANENT; 1068 state = NUD_PERMANENT;
1053 err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? 1069 err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
@@ -1059,7 +1075,7 @@ static int arp_req_set(struct net *net, struct arpreq *r,
1059 return err; 1075 return err;
1060} 1076}
1061 1077
1062static unsigned int arp_state_to_flags(struct neighbour *neigh) 1078static unsigned arp_state_to_flags(struct neighbour *neigh)
1063{ 1079{
1064 if (neigh->nud_state&NUD_PERMANENT) 1080 if (neigh->nud_state&NUD_PERMANENT)
1065 return ATF_PERM | ATF_COM; 1081 return ATF_PERM | ATF_COM;
@@ -1159,7 +1175,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1159 switch (cmd) { 1175 switch (cmd) {
1160 case SIOCDARP: 1176 case SIOCDARP:
1161 case SIOCSARP: 1177 case SIOCSARP:
1162 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 1178 if (!capable(CAP_NET_ADMIN))
1163 return -EPERM; 1179 return -EPERM;
1164 case SIOCGARP: 1180 case SIOCGARP:
1165 err = copy_from_user(&r, arg, sizeof(struct arpreq)); 1181 err = copy_from_user(&r, arg, sizeof(struct arpreq));
@@ -1223,7 +1239,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
1223 switch (event) { 1239 switch (event) {
1224 case NETDEV_CHANGEADDR: 1240 case NETDEV_CHANGEADDR:
1225 neigh_changeaddr(&arp_tbl, dev); 1241 neigh_changeaddr(&arp_tbl, dev);
1226 rt_cache_flush(dev_net(dev)); 1242 rt_cache_flush(dev_net(dev), 0);
1227 break; 1243 break;
1228 default: 1244 default:
1229 break; 1245 break;
@@ -1270,7 +1286,7 @@ void __init arp_init(void)
1270} 1286}
1271 1287
1272#ifdef CONFIG_PROC_FS 1288#ifdef CONFIG_PROC_FS
1273#if IS_ENABLED(CONFIG_AX25) 1289#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1274 1290
1275/* ------------------------------------------------------------------------ */ 1291/* ------------------------------------------------------------------------ */
1276/* 1292/*
@@ -1318,7 +1334,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
1318 1334
1319 read_lock(&n->lock); 1335 read_lock(&n->lock);
1320 /* Convert hardware address to XX:XX:XX:XX ... form. */ 1336 /* Convert hardware address to XX:XX:XX:XX ... form. */
1321#if IS_ENABLED(CONFIG_AX25) 1337#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1322 if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) 1338 if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
1323 ax2asc2((ax25_address *)n->ha, hbuffer); 1339 ax2asc2((ax25_address *)n->ha, hbuffer);
1324 else { 1340 else {
@@ -1331,7 +1347,7 @@ static void arp_format_neigh_entry(struct seq_file *seq,
1331 if (k != 0) 1347 if (k != 0)
1332 --k; 1348 --k;
1333 hbuffer[k] = 0; 1349 hbuffer[k] = 0;
1334#if IS_ENABLED(CONFIG_AX25) 1350#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1335 } 1351 }
1336#endif 1352#endif
1337 sprintf(tbuf, "%pI4", n->primary_key); 1353 sprintf(tbuf, "%pI4", n->primary_key);
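The arp.c hunks above repeatedly swap IS_ENABLED(CONFIG_FOO) for the older defined(CONFIG_FOO) || defined(CONFIG_FOO_MODULE) spelling. A minimal sketch of why the two tests are interchangeable for a tristate option, assuming the usual Kconfig convention that CONFIG_FOO_MODULE is defined for =m builds (an illustration, not the kernel's include/linux/kconfig.h implementation):

/* Equivalent checks for "AX25 support is built in or modular". */
#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)   /* older spelling */
#define HAVE_AX25 1
#else
#define HAVE_AX25 0
#endif

/* Newer kernels write the same test as IS_ENABLED(CONFIG_AX25),
 * which can also be used in ordinary C expressions, e.g.
 * if (IS_ENABLED(CONFIG_AX25)) { ... }
 */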
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 667c1d4ca98..2c2a98e402e 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -476,7 +476,7 @@ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
476 doi = doi_def->doi; 476 doi = doi_def->doi;
477 doi_type = doi_def->type; 477 doi_type = doi_def->type;
478 478
479 if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN) 479 if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
480 goto doi_add_return; 480 goto doi_add_return;
481 for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { 481 for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) {
482 switch (doi_def->tags[iter]) { 482 switch (doi_def->tags[iter]) {
@@ -1725,10 +1725,8 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
1725 case CIPSO_V4_TAG_LOCAL: 1725 case CIPSO_V4_TAG_LOCAL:
1726 /* This is a non-standard tag that we only allow for 1726 /* This is a non-standard tag that we only allow for
1727 * local connections, so if the incoming interface is 1727 * local connections, so if the incoming interface is
1728 * not the loopback device drop the packet. Further, 1728 * not the loopback device drop the packet. */
1729 * there is no legitimate reason for setting this from 1729 if (!(skb->dev->flags & IFF_LOOPBACK)) {
1730 * userspace so reject it if skb is NULL. */
1731 if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {
1732 err_offset = opt_iter; 1730 err_offset = opt_iter;
1733 goto validate_return_locked; 1731 goto validate_return_locked;
1734 } 1732 }
@@ -1859,6 +1857,11 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
1859 return CIPSO_V4_HDR_LEN + ret_val; 1857 return CIPSO_V4_HDR_LEN + ret_val;
1860} 1858}
1861 1859
1860static void opt_kfree_rcu(struct rcu_head *head)
1861{
1862 kfree(container_of(head, struct ip_options_rcu, rcu));
1863}
1864
1862/** 1865/**
1863 * cipso_v4_sock_setattr - Add a CIPSO option to a socket 1866 * cipso_v4_sock_setattr - Add a CIPSO option to a socket
1864 * @sk: the socket 1867 * @sk: the socket
@@ -1935,7 +1938,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
1935 } 1938 }
1936 rcu_assign_pointer(sk_inet->inet_opt, opt); 1939 rcu_assign_pointer(sk_inet->inet_opt, opt);
1937 if (old) 1940 if (old)
1938 kfree_rcu(old, rcu); 1941 call_rcu(&old->rcu, opt_kfree_rcu);
1939 1942
1940 return 0; 1943 return 0;
1941 1944
@@ -2002,7 +2005,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
2002 req_inet = inet_rsk(req); 2005 req_inet = inet_rsk(req);
2003 opt = xchg(&req_inet->opt, opt); 2006 opt = xchg(&req_inet->opt, opt);
2004 if (opt) 2007 if (opt)
2005 kfree_rcu(opt, rcu); 2008 call_rcu(&opt->rcu, opt_kfree_rcu);
2006 2009
2007 return 0; 2010 return 0;
2008 2011
@@ -2072,7 +2075,7 @@ static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
2072 * remove the entire option struct */ 2075 * remove the entire option struct */
2073 *opt_ptr = NULL; 2076 *opt_ptr = NULL;
2074 hdr_delta = opt->opt.optlen; 2077 hdr_delta = opt->opt.optlen;
2075 kfree_rcu(opt, rcu); 2078 call_rcu(&opt->rcu, opt_kfree_rcu);
2076 } 2079 }
2077 2080
2078 return hdr_delta; 2081 return hdr_delta;
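The cipso_ipv4.c hunks replace kfree_rcu() with an explicit call_rcu() callback, opt_kfree_rcu(), which recovers the enclosing ip_options_rcu from its embedded rcu_head. A condensed sketch of that deferred-free pattern (the _example name is illustrative, not a kernel symbol):

/* Free the enclosing object once all RCU readers are done with it. */
static void opt_kfree_rcu_example(struct rcu_head *head)
{
	kfree(container_of(head, struct ip_options_rcu, rcu));
}

/* Caller, after unpublishing the old pointer via rcu_assign_pointer()
 * or xchg():
 *
 *	if (old)
 *		call_rcu(&old->rcu, opt_kfree_rcu_example);
 */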
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a8e4f2665d5..76db59202f1 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -27,6 +27,7 @@
27 27
28 28
29#include <asm/uaccess.h> 29#include <asm/uaccess.h>
30#include <asm/system.h>
30#include <linux/bitops.h> 31#include <linux/bitops.h>
31#include <linux/capability.h> 32#include <linux/capability.h>
32#include <linux/module.h> 33#include <linux/module.h>
@@ -55,10 +56,10 @@
55#include <linux/sysctl.h> 56#include <linux/sysctl.h>
56#endif 57#endif
57#include <linux/kmod.h> 58#include <linux/kmod.h>
58#include <linux/netconf.h>
59 59
60#include <net/arp.h> 60#include <net/arp.h>
61#include <net/ip.h> 61#include <net/ip.h>
62#include <net/tcp.h>
62#include <net/route.h> 63#include <net/route.h>
63#include <net/ip_fib.h> 64#include <net/ip_fib.h>
64#include <net/rtnetlink.h> 65#include <net/rtnetlink.h>
@@ -95,22 +96,25 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
95 [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, 96 [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
96}; 97};
97 98
98#define IN4_ADDR_HSIZE_SHIFT 8 99/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
99#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) 100 * value. So if you change this define, make appropriate changes to
100 101 * inet_addr_hash as well.
102 */
103#define IN4_ADDR_HSIZE 256
101static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; 104static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
102static DEFINE_SPINLOCK(inet_addr_hash_lock); 105static DEFINE_SPINLOCK(inet_addr_hash_lock);
103 106
104static u32 inet_addr_hash(struct net *net, __be32 addr) 107static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
105{ 108{
106 u32 val = (__force u32) addr ^ net_hash_mix(net); 109 u32 val = (__force u32) addr ^ hash_ptr(net, 8);
107 110
108 return hash_32(val, IN4_ADDR_HSIZE_SHIFT); 111 return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
112 (IN4_ADDR_HSIZE - 1));
109} 113}
110 114
111static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) 115static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
112{ 116{
113 u32 hash = inet_addr_hash(net, ifa->ifa_local); 117 unsigned int hash = inet_addr_hash(net, ifa->ifa_local);
114 118
115 spin_lock(&inet_addr_hash_lock); 119 spin_lock(&inet_addr_hash_lock);
116 hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); 120 hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
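Within this hunk the hash function itself changes: the hash_32()-based mix over IN4_ADDR_HSIZE_SHIFT bits is replaced by the older byte-folding XOR masked to the 256-entry table. A side-by-side sketch of the two variants as they appear above (illustrative helper names; both assume the 256-bucket inet_addr_lst):

#define IN4_ADDR_HSIZE_SHIFT	8
#define IN4_ADDR_HSIZE		(1U << IN4_ADDR_HSIZE_SHIFT)	/* 256 */

/* newer scheme: multiplicative hash_32() keeps the high bits */
static inline u32 addr_hash_new(u32 val)
{
	return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
}

/* older scheme restored above: XOR-fold the four bytes, then mask */
static inline u32 addr_hash_old(u32 val)
{
	return (val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
	       (IN4_ADDR_HSIZE - 1);
}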
@@ -134,18 +138,18 @@ static void inet_hash_remove(struct in_ifaddr *ifa)
134 */ 138 */
135struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) 139struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
136{ 140{
137 u32 hash = inet_addr_hash(net, addr); 141 unsigned int hash = inet_addr_hash(net, addr);
138 struct net_device *result = NULL; 142 struct net_device *result = NULL;
139 struct in_ifaddr *ifa; 143 struct in_ifaddr *ifa;
140 struct hlist_node *node; 144 struct hlist_node *node;
141 145
142 rcu_read_lock(); 146 rcu_read_lock();
143 hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { 147 hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
144 if (ifa->ifa_local == addr) { 148 struct net_device *dev = ifa->ifa_dev->dev;
145 struct net_device *dev = ifa->ifa_dev->dev;
146 149
147 if (!net_eq(dev_net(dev), net)) 150 if (!net_eq(dev_net(dev), net))
148 continue; 151 continue;
152 if (ifa->ifa_local == addr) {
149 result = dev; 153 result = dev;
150 break; 154 break;
151 } 155 }
@@ -180,10 +184,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
180static void devinet_sysctl_register(struct in_device *idev); 184static void devinet_sysctl_register(struct in_device *idev);
181static void devinet_sysctl_unregister(struct in_device *idev); 185static void devinet_sysctl_unregister(struct in_device *idev);
182#else 186#else
183static void devinet_sysctl_register(struct in_device *idev) 187static inline void devinet_sysctl_register(struct in_device *idev)
184{ 188{
185} 189}
186static void devinet_sysctl_unregister(struct in_device *idev) 190static inline void devinet_sysctl_unregister(struct in_device *idev)
187{ 191{
188} 192}
189#endif 193#endif
@@ -203,7 +207,7 @@ static void inet_rcu_free_ifa(struct rcu_head *head)
203 kfree(ifa); 207 kfree(ifa);
204} 208}
205 209
206static void inet_free_ifa(struct in_ifaddr *ifa) 210static inline void inet_free_ifa(struct in_ifaddr *ifa)
207{ 211{
208 call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); 212 call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
209} 213}
@@ -215,7 +219,8 @@ void in_dev_finish_destroy(struct in_device *idev)
215 WARN_ON(idev->ifa_list); 219 WARN_ON(idev->ifa_list);
216 WARN_ON(idev->mc_list); 220 WARN_ON(idev->mc_list);
217#ifdef NET_REFCNT_DEBUG 221#ifdef NET_REFCNT_DEBUG
218 pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL"); 222 printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
223 idev, dev ? dev->name : "NIL");
219#endif 224#endif
220 dev_put(dev); 225 dev_put(dev);
221 if (!idev->dead) 226 if (!idev->dead)
@@ -287,7 +292,7 @@ static void inetdev_destroy(struct in_device *in_dev)
287 inet_free_ifa(ifa); 292 inet_free_ifa(ifa);
288 } 293 }
289 294
290 RCU_INIT_POINTER(dev->ip_ptr, NULL); 295 rcu_assign_pointer(dev->ip_ptr, NULL);
291 296
292 devinet_sysctl_unregister(in_dev); 297 devinet_sysctl_unregister(in_dev);
293 neigh_parms_release(&arp_tbl, in_dev->arp_parms); 298 neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -312,7 +317,7 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
312} 317}
313 318
314static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, 319static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
315 int destroy, struct nlmsghdr *nlh, u32 portid) 320 int destroy, struct nlmsghdr *nlh, u32 pid)
316{ 321{
317 struct in_ifaddr *promote = NULL; 322 struct in_ifaddr *promote = NULL;
318 struct in_ifaddr *ifa, *ifa1 = *ifap; 323 struct in_ifaddr *ifa, *ifa1 = *ifap;
@@ -346,7 +351,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
346 inet_hash_remove(ifa); 351 inet_hash_remove(ifa);
347 *ifap1 = ifa->ifa_next; 352 *ifap1 = ifa->ifa_next;
348 353
349 rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); 354 rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
350 blocking_notifier_call_chain(&inetaddr_chain, 355 blocking_notifier_call_chain(&inetaddr_chain,
351 NETDEV_DOWN, ifa); 356 NETDEV_DOWN, ifa);
352 inet_free_ifa(ifa); 357 inet_free_ifa(ifa);
@@ -383,7 +388,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
383 is valid, it will try to restore deleted routes... Grr. 388 is valid, it will try to restore deleted routes... Grr.
384 So that, this order is correct. 389 So that, this order is correct.
385 */ 390 */
386 rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); 391 rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
387 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); 392 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
388 393
389 if (promote) { 394 if (promote) {
@@ -396,7 +401,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
396 } 401 }
397 402
398 promote->ifa_flags &= ~IFA_F_SECONDARY; 403 promote->ifa_flags &= ~IFA_F_SECONDARY;
399 rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); 404 rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
400 blocking_notifier_call_chain(&inetaddr_chain, 405 blocking_notifier_call_chain(&inetaddr_chain,
401 NETDEV_UP, promote); 406 NETDEV_UP, promote);
402 for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { 407 for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
@@ -418,7 +423,7 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
418} 423}
419 424
420static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, 425static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
421 u32 portid) 426 u32 pid)
422{ 427{
423 struct in_device *in_dev = ifa->ifa_dev; 428 struct in_device *in_dev = ifa->ifa_dev;
424 struct in_ifaddr *ifa1, **ifap, **last_primary; 429 struct in_ifaddr *ifa1, **ifap, **last_primary;
@@ -465,7 +470,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
465 /* Send message first, then call notifier. 470 /* Send message first, then call notifier.
466 Notifier will trigger FIB update, so that 471 Notifier will trigger FIB update, so that
467 listeners of netlink will know about new ifaddr */ 472 listeners of netlink will know about new ifaddr */
468 rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); 473 rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
469 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); 474 blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
470 475
471 return 0; 476 return 0;
@@ -564,7 +569,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
564 !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa))) 569 !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa)))
565 continue; 570 continue;
566 571
567 __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); 572 __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).pid);
568 return 0; 573 return 0;
569 } 574 }
570 575
@@ -650,14 +655,14 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
650 if (IS_ERR(ifa)) 655 if (IS_ERR(ifa))
651 return PTR_ERR(ifa); 656 return PTR_ERR(ifa);
652 657
653 return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); 658 return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).pid);
654} 659}
655 660
656/* 661/*
657 * Determine a default network mask, based on the IP address. 662 * Determine a default network mask, based on the IP address.
658 */ 663 */
659 664
660static int inet_abc_len(__be32 addr) 665static inline int inet_abc_len(__be32 addr)
661{ 666{
662 int rc = -1; /* Something else, probably a multicast. */ 667 int rc = -1; /* Something else, probably a multicast. */
663 668
@@ -723,16 +728,17 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
723 break; 728 break;
724 729
725 case SIOCSIFFLAGS: 730 case SIOCSIFFLAGS:
726 ret = -EPERM; 731 ret = -EACCES;
727 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 732 if (!capable(CAP_NET_ADMIN))
728 goto out; 733 goto out;
729 break; 734 break;
730 case SIOCSIFADDR: /* Set interface address (and family) */ 735 case SIOCSIFADDR: /* Set interface address (and family) */
731 case SIOCSIFBRDADDR: /* Set the broadcast address */ 736 case SIOCSIFBRDADDR: /* Set the broadcast address */
732 case SIOCSIFDSTADDR: /* Set the destination address */ 737 case SIOCSIFDSTADDR: /* Set the destination address */
733 case SIOCSIFNETMASK: /* Set the netmask for the interface */ 738 case SIOCSIFNETMASK: /* Set the netmask for the interface */
734 ret = -EPERM; 739 case SIOCKILLADDR: /* Nuke all sockets on this address */
735 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 740 ret = -EACCES;
741 if (!capable(CAP_NET_ADMIN))
736 goto out; 742 goto out;
737 ret = -EINVAL; 743 ret = -EINVAL;
738 if (sin->sin_family != AF_INET) 744 if (sin->sin_family != AF_INET)
@@ -782,7 +788,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
782 } 788 }
783 789
784 ret = -EADDRNOTAVAIL; 790 ret = -EADDRNOTAVAIL;
785 if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) 791 if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS
792 && cmd != SIOCKILLADDR)
786 goto done; 793 goto done;
787 794
788 switch (cmd) { 795 switch (cmd) {
@@ -823,9 +830,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
823 if (!ifa) { 830 if (!ifa) {
824 ret = -ENOBUFS; 831 ret = -ENOBUFS;
825 ifa = inet_alloc_ifa(); 832 ifa = inet_alloc_ifa();
833 INIT_HLIST_NODE(&ifa->hash);
826 if (!ifa) 834 if (!ifa)
827 break; 835 break;
828 INIT_HLIST_NODE(&ifa->hash);
829 if (colon) 836 if (colon)
830 memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); 837 memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
831 else 838 else
@@ -908,6 +915,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
908 inet_insert_ifa(ifa); 915 inet_insert_ifa(ifa);
909 } 916 }
910 break; 917 break;
918 case SIOCKILLADDR: /* Nuke all connections on this address */
919 ret = tcp_nuke_addr(net, (struct sockaddr *) sin);
920 break;
911 } 921 }
912done: 922done:
913 rtnl_unlock(); 923 rtnl_unlock();
@@ -1075,7 +1085,6 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
1075 1085
1076 return addr; 1086 return addr;
1077} 1087}
1078EXPORT_SYMBOL(inet_confirm_addr);
1079 1088
1080/* 1089/*
1081 * Device notifier 1090 * Device notifier
@@ -1122,7 +1131,7 @@ skip:
1122 } 1131 }
1123} 1132}
1124 1133
1125static bool inetdev_valid_mtu(unsigned int mtu) 1134static inline bool inetdev_valid_mtu(unsigned mtu)
1126{ 1135{
1127 return mtu >= 68; 1136 return mtu >= 68;
1128} 1137}
@@ -1171,8 +1180,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1171 1180
1172 switch (event) { 1181 switch (event) {
1173 case NETDEV_REGISTER: 1182 case NETDEV_REGISTER:
1174 pr_debug("%s: bug\n", __func__); 1183 printk(KERN_DEBUG "inetdev_event: bug\n");
1175 RCU_INIT_POINTER(dev->ip_ptr, NULL); 1184 rcu_assign_pointer(dev->ip_ptr, NULL);
1176 break; 1185 break;
1177 case NETDEV_UP: 1186 case NETDEV_UP:
1178 if (!inetdev_valid_mtu(dev->mtu)) 1187 if (!inetdev_valid_mtu(dev->mtu))
@@ -1237,7 +1246,7 @@ static struct notifier_block ip_netdev_notifier = {
1237 .notifier_call = inetdev_event, 1246 .notifier_call = inetdev_event,
1238}; 1247};
1239 1248
1240static size_t inet_nlmsg_size(void) 1249static inline size_t inet_nlmsg_size(void)
1241{ 1250{
1242 return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) 1251 return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
1243 + nla_total_size(4) /* IFA_ADDRESS */ 1252 + nla_total_size(4) /* IFA_ADDRESS */
@@ -1247,12 +1256,12 @@ static size_t inet_nlmsg_size(void)
1247} 1256}
1248 1257
1249static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, 1258static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1250 u32 portid, u32 seq, int event, unsigned int flags) 1259 u32 pid, u32 seq, int event, unsigned int flags)
1251{ 1260{
1252 struct ifaddrmsg *ifm; 1261 struct ifaddrmsg *ifm;
1253 struct nlmsghdr *nlh; 1262 struct nlmsghdr *nlh;
1254 1263
1255 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags); 1264 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*ifm), flags);
1256 if (nlh == NULL) 1265 if (nlh == NULL)
1257 return -EMSGSIZE; 1266 return -EMSGSIZE;
1258 1267
@@ -1263,15 +1272,17 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
1263 ifm->ifa_scope = ifa->ifa_scope; 1272 ifm->ifa_scope = ifa->ifa_scope;
1264 ifm->ifa_index = ifa->ifa_dev->dev->ifindex; 1273 ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
1265 1274
1266 if ((ifa->ifa_address && 1275 if (ifa->ifa_address)
1267 nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) || 1276 NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address);
1268 (ifa->ifa_local && 1277
1269 nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) || 1278 if (ifa->ifa_local)
1270 (ifa->ifa_broadcast && 1279 NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local);
1271 nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || 1280
1272 (ifa->ifa_label[0] && 1281 if (ifa->ifa_broadcast)
1273 nla_put_string(skb, IFA_LABEL, ifa->ifa_label))) 1282 NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
1274 goto nla_put_failure; 1283
1284 if (ifa->ifa_label[0])
1285 NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
1275 1286
1276 return nlmsg_end(skb, nlh); 1287 return nlmsg_end(skb, nlh);
1277 1288
@@ -1314,7 +1325,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
1314 if (ip_idx < s_ip_idx) 1325 if (ip_idx < s_ip_idx)
1315 continue; 1326 continue;
1316 if (inet_fill_ifaddr(skb, ifa, 1327 if (inet_fill_ifaddr(skb, ifa,
1317 NETLINK_CB(cb->skb).portid, 1328 NETLINK_CB(cb->skb).pid,
1318 cb->nlh->nlmsg_seq, 1329 cb->nlh->nlmsg_seq,
1319 RTM_NEWADDR, NLM_F_MULTI) <= 0) { 1330 RTM_NEWADDR, NLM_F_MULTI) <= 0) {
1320 rcu_read_unlock(); 1331 rcu_read_unlock();
@@ -1336,7 +1347,7 @@ done:
1336} 1347}
1337 1348
1338static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, 1349static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
1339 u32 portid) 1350 u32 pid)
1340{ 1351{
1341 struct sk_buff *skb; 1352 struct sk_buff *skb;
1342 u32 seq = nlh ? nlh->nlmsg_seq : 0; 1353 u32 seq = nlh ? nlh->nlmsg_seq : 0;
@@ -1348,14 +1359,14 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
1348 if (skb == NULL) 1359 if (skb == NULL)
1349 goto errout; 1360 goto errout;
1350 1361
1351 err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0); 1362 err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0);
1352 if (err < 0) { 1363 if (err < 0) {
1353 /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ 1364 /* -EMSGSIZE implies BUG in inet_nlmsg_size() */
1354 WARN_ON(err == -EMSGSIZE); 1365 WARN_ON(err == -EMSGSIZE);
1355 kfree_skb(skb); 1366 kfree_skb(skb);
1356 goto errout; 1367 goto errout;
1357 } 1368 }
1358 rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); 1369 rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
1359 return; 1370 return;
1360errout: 1371errout:
1361 if (err < 0) 1372 if (err < 0)
@@ -1443,155 +1454,6 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
1443 return 0; 1454 return 0;
1444} 1455}
1445 1456
1446static int inet_netconf_msgsize_devconf(int type)
1447{
1448 int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
1449 + nla_total_size(4); /* NETCONFA_IFINDEX */
1450
1451 /* type -1 is used for ALL */
1452 if (type == -1 || type == NETCONFA_FORWARDING)
1453 size += nla_total_size(4);
1454 if (type == -1 || type == NETCONFA_RP_FILTER)
1455 size += nla_total_size(4);
1456 if (type == -1 || type == NETCONFA_MC_FORWARDING)
1457 size += nla_total_size(4);
1458
1459 return size;
1460}
1461
1462static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
1463 struct ipv4_devconf *devconf, u32 portid,
1464 u32 seq, int event, unsigned int flags,
1465 int type)
1466{
1467 struct nlmsghdr *nlh;
1468 struct netconfmsg *ncm;
1469
1470 nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
1471 flags);
1472 if (nlh == NULL)
1473 return -EMSGSIZE;
1474
1475 ncm = nlmsg_data(nlh);
1476 ncm->ncm_family = AF_INET;
1477
1478 if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
1479 goto nla_put_failure;
1480
1481 /* type -1 is used for ALL */
1482 if ((type == -1 || type == NETCONFA_FORWARDING) &&
1483 nla_put_s32(skb, NETCONFA_FORWARDING,
1484 IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
1485 goto nla_put_failure;
1486 if ((type == -1 || type == NETCONFA_RP_FILTER) &&
1487 nla_put_s32(skb, NETCONFA_RP_FILTER,
1488 IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
1489 goto nla_put_failure;
1490 if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
1491 nla_put_s32(skb, NETCONFA_MC_FORWARDING,
1492 IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
1493 goto nla_put_failure;
1494
1495 return nlmsg_end(skb, nlh);
1496
1497nla_put_failure:
1498 nlmsg_cancel(skb, nlh);
1499 return -EMSGSIZE;
1500}
1501
1502void inet_netconf_notify_devconf(struct net *net, int type, int ifindex,
1503 struct ipv4_devconf *devconf)
1504{
1505 struct sk_buff *skb;
1506 int err = -ENOBUFS;
1507
1508 skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_ATOMIC);
1509 if (skb == NULL)
1510 goto errout;
1511
1512 err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
1513 RTM_NEWNETCONF, 0, type);
1514 if (err < 0) {
1515 /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
1516 WARN_ON(err == -EMSGSIZE);
1517 kfree_skb(skb);
1518 goto errout;
1519 }
1520 rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_ATOMIC);
1521 return;
1522errout:
1523 if (err < 0)
1524 rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
1525}
1526
1527static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
1528 [NETCONFA_IFINDEX] = { .len = sizeof(int) },
1529 [NETCONFA_FORWARDING] = { .len = sizeof(int) },
1530 [NETCONFA_RP_FILTER] = { .len = sizeof(int) },
1531};
1532
1533static int inet_netconf_get_devconf(struct sk_buff *in_skb,
1534 struct nlmsghdr *nlh,
1535 void *arg)
1536{
1537 struct net *net = sock_net(in_skb->sk);
1538 struct nlattr *tb[NETCONFA_MAX+1];
1539 struct netconfmsg *ncm;
1540 struct sk_buff *skb;
1541 struct ipv4_devconf *devconf;
1542 struct in_device *in_dev;
1543 struct net_device *dev;
1544 int ifindex;
1545 int err;
1546
1547 err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
1548 devconf_ipv4_policy);
1549 if (err < 0)
1550 goto errout;
1551
1552 err = EINVAL;
1553 if (!tb[NETCONFA_IFINDEX])
1554 goto errout;
1555
1556 ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
1557 switch (ifindex) {
1558 case NETCONFA_IFINDEX_ALL:
1559 devconf = net->ipv4.devconf_all;
1560 break;
1561 case NETCONFA_IFINDEX_DEFAULT:
1562 devconf = net->ipv4.devconf_dflt;
1563 break;
1564 default:
1565 dev = __dev_get_by_index(net, ifindex);
1566 if (dev == NULL)
1567 goto errout;
1568 in_dev = __in_dev_get_rtnl(dev);
1569 if (in_dev == NULL)
1570 goto errout;
1571 devconf = &in_dev->cnf;
1572 break;
1573 }
1574
1575 err = -ENOBUFS;
1576 skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
1577 if (skb == NULL)
1578 goto errout;
1579
1580 err = inet_netconf_fill_devconf(skb, ifindex, devconf,
1581 NETLINK_CB(in_skb).portid,
1582 nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
1583 -1);
1584 if (err < 0) {
1585 /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
1586 WARN_ON(err == -EMSGSIZE);
1587 kfree_skb(skb);
1588 goto errout;
1589 }
1590 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
1591errout:
1592 return err;
1593}
1594
1595#ifdef CONFIG_SYSCTL 1457#ifdef CONFIG_SYSCTL
1596 1458
1597static void devinet_copy_dflt_conf(struct net *net, int i) 1459static void devinet_copy_dflt_conf(struct net *net, int i)
@@ -1617,12 +1479,6 @@ static void inet_forward_change(struct net *net)
1617 1479
1618 IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; 1480 IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
1619 IPV4_DEVCONF_DFLT(net, FORWARDING) = on; 1481 IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
1620 inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
1621 NETCONFA_IFINDEX_ALL,
1622 net->ipv4.devconf_all);
1623 inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
1624 NETCONFA_IFINDEX_DEFAULT,
1625 net->ipv4.devconf_dflt);
1626 1482
1627 for_each_netdev(net, dev) { 1483 for_each_netdev(net, dev) {
1628 struct in_device *in_dev; 1484 struct in_device *in_dev;
@@ -1630,11 +1486,8 @@ static void inet_forward_change(struct net *net)
1630 dev_disable_lro(dev); 1486 dev_disable_lro(dev);
1631 rcu_read_lock(); 1487 rcu_read_lock();
1632 in_dev = __in_dev_get_rcu(dev); 1488 in_dev = __in_dev_get_rcu(dev);
1633 if (in_dev) { 1489 if (in_dev)
1634 IN_DEV_CONF_SET(in_dev, FORWARDING, on); 1490 IN_DEV_CONF_SET(in_dev, FORWARDING, on);
1635 inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
1636 dev->ifindex, &in_dev->cnf);
1637 }
1638 rcu_read_unlock(); 1491 rcu_read_unlock();
1639 } 1492 }
1640} 1493}
@@ -1656,27 +1509,9 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
1656 1509
1657 if (cnf == net->ipv4.devconf_dflt) 1510 if (cnf == net->ipv4.devconf_dflt)
1658 devinet_copy_dflt_conf(net, i); 1511 devinet_copy_dflt_conf(net, i);
1659 if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || 1512 if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1)
1660 i == IPV4_DEVCONF_ROUTE_LOCALNET - 1)
1661 if ((new_value == 0) && (old_value != 0)) 1513 if ((new_value == 0) && (old_value != 0))
1662 rt_cache_flush(net); 1514 rt_cache_flush(net, 0);
1663 if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
1664 new_value != old_value) {
1665 int ifindex;
1666
1667 if (cnf == net->ipv4.devconf_dflt)
1668 ifindex = NETCONFA_IFINDEX_DEFAULT;
1669 else if (cnf == net->ipv4.devconf_all)
1670 ifindex = NETCONFA_IFINDEX_ALL;
1671 else {
1672 struct in_device *idev =
1673 container_of(cnf, struct in_device,
1674 cnf);
1675 ifindex = idev->dev->ifindex;
1676 }
1677 inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER,
1678 ifindex, cnf);
1679 }
1680 } 1515 }
1681 1516
1682 return ret; 1517 return ret;
@@ -1703,23 +1538,15 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
1703 } 1538 }
1704 if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { 1539 if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
1705 inet_forward_change(net); 1540 inet_forward_change(net);
1706 } else { 1541 } else if (*valp) {
1707 struct ipv4_devconf *cnf = ctl->extra1; 1542 struct ipv4_devconf *cnf = ctl->extra1;
1708 struct in_device *idev = 1543 struct in_device *idev =
1709 container_of(cnf, struct in_device, cnf); 1544 container_of(cnf, struct in_device, cnf);
1710 if (*valp) 1545 dev_disable_lro(idev->dev);
1711 dev_disable_lro(idev->dev);
1712 inet_netconf_notify_devconf(net,
1713 NETCONFA_FORWARDING,
1714 idev->dev->ifindex,
1715 cnf);
1716 } 1546 }
1717 rtnl_unlock(); 1547 rtnl_unlock();
1718 rt_cache_flush(net); 1548 rt_cache_flush(net, 0);
1719 } else 1549 }
1720 inet_netconf_notify_devconf(net, NETCONFA_FORWARDING,
1721 NETCONFA_IFINDEX_DEFAULT,
1722 net->ipv4.devconf_dflt);
1723 } 1550 }
1724 1551
1725 return ret; 1552 return ret;
@@ -1735,7 +1562,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
1735 struct net *net = ctl->extra2; 1562 struct net *net = ctl->extra2;
1736 1563
1737 if (write && *valp != val) 1564 if (write && *valp != val)
1738 rt_cache_flush(net); 1565 rt_cache_flush(net, 0);
1739 1566
1740 return ret; 1567 return ret;
1741} 1568}
@@ -1766,6 +1593,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
1766static struct devinet_sysctl_table { 1593static struct devinet_sysctl_table {
1767 struct ctl_table_header *sysctl_header; 1594 struct ctl_table_header *sysctl_header;
1768 struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; 1595 struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
1596 char *dev_name;
1769} devinet_sysctl = { 1597} devinet_sysctl = {
1770 .devinet_vars = { 1598 .devinet_vars = {
1771 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", 1599 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
@@ -1799,8 +1627,6 @@ static struct devinet_sysctl_table {
1799 "force_igmp_version"), 1627 "force_igmp_version"),
1800 DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, 1628 DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
1801 "promote_secondaries"), 1629 "promote_secondaries"),
1802 DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
1803 "route_localnet"),
1804 }, 1630 },
1805}; 1631};
1806 1632
@@ -1809,7 +1635,16 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
1809{ 1635{
1810 int i; 1636 int i;
1811 struct devinet_sysctl_table *t; 1637 struct devinet_sysctl_table *t;
1812 char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; 1638
1639#define DEVINET_CTL_PATH_DEV 3
1640
1641 struct ctl_path devinet_ctl_path[] = {
1642 { .procname = "net", },
1643 { .procname = "ipv4", },
1644 { .procname = "conf", },
1645 { /* to be set */ },
1646 { },
1647 };
1813 1648
1814 t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); 1649 t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
1815 if (!t) 1650 if (!t)
@@ -1821,15 +1656,27 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
1821 t->devinet_vars[i].extra2 = net; 1656 t->devinet_vars[i].extra2 = net;
1822 } 1657 }
1823 1658
1824 snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); 1659 /*
1660 * Make a copy of dev_name, because '.procname' is regarded as const
1661 * by sysctl and we wouldn't want anyone to change it under our feet
1662 * (see SIOCSIFNAME).
1663 */
1664 t->dev_name = kstrdup(dev_name, GFP_KERNEL);
1665 if (!t->dev_name)
1666 goto free;
1667
1668 devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
1825 1669
1826 t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); 1670 t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
1671 t->devinet_vars);
1827 if (!t->sysctl_header) 1672 if (!t->sysctl_header)
1828 goto free; 1673 goto free_procname;
1829 1674
1830 p->sysctl = t; 1675 p->sysctl = t;
1831 return 0; 1676 return 0;
1832 1677
1678free_procname:
1679 kfree(t->dev_name);
1833free: 1680free:
1834 kfree(t); 1681 kfree(t);
1835out: 1682out:
@@ -1845,6 +1692,7 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
1845 1692
1846 cnf->sysctl = NULL; 1693 cnf->sysctl = NULL;
1847 unregister_net_sysctl_table(t->sysctl_header); 1694 unregister_net_sysctl_table(t->sysctl_header);
1695 kfree(t->dev_name);
1848 kfree(t); 1696 kfree(t);
1849} 1697}
1850 1698
@@ -1874,6 +1722,12 @@ static struct ctl_table ctl_forward_entry[] = {
1874 }, 1722 },
1875 { }, 1723 { },
1876}; 1724};
1725
1726static __net_initdata struct ctl_path net_ipv4_path[] = {
1727 { .procname = "net", },
1728 { .procname = "ipv4", },
1729 { },
1730};
1877#endif 1731#endif
1878 1732
1879static __net_init int devinet_init_net(struct net *net) 1733static __net_init int devinet_init_net(struct net *net)
@@ -1919,7 +1773,7 @@ static __net_init int devinet_init_net(struct net *net)
1919 goto err_reg_dflt; 1773 goto err_reg_dflt;
1920 1774
1921 err = -ENOMEM; 1775 err = -ENOMEM;
1922 forw_hdr = register_net_sysctl(net, "net/ipv4", tbl); 1776 forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);
1923 if (forw_hdr == NULL) 1777 if (forw_hdr == NULL)
1924 goto err_reg_ctl; 1778 goto err_reg_ctl;
1925 net->ipv4.forw_hdr = forw_hdr; 1779 net->ipv4.forw_hdr = forw_hdr;
@@ -1993,7 +1847,5 @@ void __init devinet_init(void)
1993 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); 1847 rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
1994 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); 1848 rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
1995 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); 1849 rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);
1996 rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
1997 NULL, NULL);
1998} 1850}
1999 1851
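devinet.c also reverts the sysctl registration from the single-string register_net_sysctl() call to the older ctl_path-array API, which is why the per-device name has to be kstrdup()'d and kept alongside the table. A sketch of the two styles (the _example identifier is illustrative):

/* older style, as restored above: a ctl_path array whose device
 * component must outlive the registration, hence the kstrdup(). */
static struct ctl_path devinet_ctl_path_example[] = {
	{ .procname = "net",  },
	{ .procname = "ipv4", },
	{ .procname = "conf", },
	{ /* .procname = kstrdup(dev_name, GFP_KERNEL) at runtime */ },
	{ },
};
/* header = register_net_sysctl_table(net, devinet_ctl_path_example, vars); */

/* newer style removed by this patch: one formatted path string
 *	snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);
 *	header = register_net_sysctl(net, path, vars);
 */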
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b61e9deb7c7..a5b413416da 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1,5 +1,3 @@
1#define pr_fmt(fmt) "IPsec: " fmt
2
3#include <crypto/aead.h> 1#include <crypto/aead.h>
4#include <crypto/authenc.h> 2#include <crypto/authenc.h>
5#include <linux/err.h> 3#include <linux/err.h>
@@ -459,22 +457,28 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
459 struct esp_data *esp = x->data; 457 struct esp_data *esp = x->data;
460 u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); 458 u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
461 u32 align = max_t(u32, blksize, esp->padlen); 459 u32 align = max_t(u32, blksize, esp->padlen);
462 unsigned int net_adj; 460 u32 rem;
461
462 mtu -= x->props.header_len + crypto_aead_authsize(esp->aead);
463 rem = mtu & (align - 1);
464 mtu &= ~(align - 1);
463 465
464 switch (x->props.mode) { 466 switch (x->props.mode) {
465 case XFRM_MODE_TRANSPORT:
466 case XFRM_MODE_BEET:
467 net_adj = sizeof(struct iphdr);
468 break;
469 case XFRM_MODE_TUNNEL: 467 case XFRM_MODE_TUNNEL:
470 net_adj = 0;
471 break; 468 break;
472 default: 469 default:
473 BUG(); 470 case XFRM_MODE_TRANSPORT:
471 /* The worst case */
472 mtu -= blksize - 4;
473 mtu += min_t(u32, blksize - 4, rem);
474 break;
475 case XFRM_MODE_BEET:
476 /* The worst case. */
477 mtu += min_t(u32, IPV4_BEET_PHMAXLEN, rem);
478 break;
474 } 479 }
475 480
476 return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - 481 return mtu - 2;
477 net_adj) & ~(align - 1)) + (net_adj - 2);
478} 482}
479 483
480static void esp4_err(struct sk_buff *skb, u32 info) 484static void esp4_err(struct sk_buff *skb, u32 info)
@@ -484,25 +488,16 @@ static void esp4_err(struct sk_buff *skb, u32 info)
484 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); 488 struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
485 struct xfrm_state *x; 489 struct xfrm_state *x;
486 490
487 switch (icmp_hdr(skb)->type) { 491 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
488 case ICMP_DEST_UNREACH: 492 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
489 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
490 return;
491 case ICMP_REDIRECT:
492 break;
493 default:
494 return; 493 return;
495 }
496 494
497 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 495 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
498 esph->spi, IPPROTO_ESP, AF_INET); 496 esph->spi, IPPROTO_ESP, AF_INET);
499 if (!x) 497 if (!x)
500 return; 498 return;
501 499 NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
502 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 500 ntohl(esph->spi), ntohl(iph->daddr));
503 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
504 else
505 ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
506 xfrm_state_put(x); 501 xfrm_state_put(x);
507} 502}
508 503
@@ -711,11 +706,11 @@ static const struct net_protocol esp4_protocol = {
711static int __init esp4_init(void) 706static int __init esp4_init(void)
712{ 707{
713 if (xfrm_register_type(&esp_type, AF_INET) < 0) { 708 if (xfrm_register_type(&esp_type, AF_INET) < 0) {
714 pr_info("%s: can't add xfrm type\n", __func__); 709 printk(KERN_INFO "ip esp init: can't add xfrm type\n");
715 return -EAGAIN; 710 return -EAGAIN;
716 } 711 }
717 if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { 712 if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
718 pr_info("%s: can't add protocol\n", __func__); 713 printk(KERN_INFO "ip esp init: can't add protocol\n");
719 xfrm_unregister_type(&esp_type, AF_INET); 714 xfrm_unregister_type(&esp_type, AF_INET);
720 return -EAGAIN; 715 return -EAGAIN;
721 } 716 }
@@ -725,9 +720,9 @@ static int __init esp4_init(void)
725static void __exit esp4_fini(void) 720static void __exit esp4_fini(void)
726{ 721{
727 if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) 722 if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
728 pr_info("%s: can't remove protocol\n", __func__); 723 printk(KERN_INFO "ip esp close: can't remove protocol\n");
729 if (xfrm_unregister_type(&esp_type, AF_INET) < 0) 724 if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
730 pr_info("%s: can't remove xfrm type\n", __func__); 725 printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
731} 726}
732 727
733module_init(esp4_init); 728module_init(esp4_init);
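The esp4_get_mtu() hunk restores the older alignment arithmetic. A worked example of that computation for transport mode, assuming an illustrative 1500-byte link MTU, a 24-byte ESP header, a 12-byte ICV and a 16-byte cipher block (so align = 16); the numbers only show the flow, real values depend on the SA:

/* Condensed restatement of the restored transport-mode path. */
static u32 esp4_mtu_transport_example(u32 mtu, u32 header_len,
				      u32 icv_len, u32 blksize)
{
	u32 align = blksize;			/* assumes blksize >= padlen */
	u32 rem;

	mtu -= header_len + icv_len;		/* 1500 - 24 - 12 = 1464 */
	rem  = mtu & (align - 1);		/* 1464 & 15      = 8    */
	mtu &= ~(align - 1);			/* round down     = 1456 */
	mtu -= blksize - 4;			/* worst-case pad = 1444 */
	mtu += min_t(u32, blksize - 4, rem);	/* reclaim slack  = 1452 */
	return mtu - 2;				/* pad-len + proto = 1450 */
}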
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 5cd75e2dab2..92fc5f69f5d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/module.h> 16#include <linux/module.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include <asm/system.h>
18#include <linux/bitops.h> 19#include <linux/bitops.h>
19#include <linux/capability.h> 20#include <linux/capability.h>
20#include <linux/types.h> 21#include <linux/types.h>
@@ -31,7 +32,6 @@
31#include <linux/if_addr.h> 32#include <linux/if_addr.h>
32#include <linux/if_arp.h> 33#include <linux/if_arp.h>
33#include <linux/skbuff.h> 34#include <linux/skbuff.h>
34#include <linux/cache.h>
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/list.h> 36#include <linux/list.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
@@ -86,24 +86,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
86 tb = fib_trie_table(id); 86 tb = fib_trie_table(id);
87 if (!tb) 87 if (!tb)
88 return NULL; 88 return NULL;
89
90 switch (id) {
91 case RT_TABLE_LOCAL:
92 net->ipv4.fib_local = tb;
93 break;
94
95 case RT_TABLE_MAIN:
96 net->ipv4.fib_main = tb;
97 break;
98
99 case RT_TABLE_DEFAULT:
100 net->ipv4.fib_default = tb;
101 break;
102
103 default:
104 break;
105 }
106
107 h = id & (FIB_TABLE_HASHSZ - 1); 89 h = id & (FIB_TABLE_HASHSZ - 1);
108 hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); 90 hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
109 return tb; 91 return tb;
@@ -148,20 +130,20 @@ static void fib_flush(struct net *net)
148 } 130 }
149 131
150 if (flushed) 132 if (flushed)
151 rt_cache_flush(net); 133 rt_cache_flush(net, -1);
152} 134}
153 135
154/* 136/*
155 * Find address type as if only "dev" was present in the system. If 137 * Find address type as if only "dev" was present in the system. If
156 * on_dev is NULL then all interfaces are taken into consideration. 138 * on_dev is NULL then all interfaces are taken into consideration.
157 */ 139 */
158static inline unsigned int __inet_dev_addr_type(struct net *net, 140static inline unsigned __inet_dev_addr_type(struct net *net,
159 const struct net_device *dev, 141 const struct net_device *dev,
160 __be32 addr) 142 __be32 addr)
161{ 143{
162 struct flowi4 fl4 = { .daddr = addr }; 144 struct flowi4 fl4 = { .daddr = addr };
163 struct fib_result res; 145 struct fib_result res;
164 unsigned int ret = RTN_BROADCAST; 146 unsigned ret = RTN_BROADCAST;
165 struct fib_table *local_table; 147 struct fib_table *local_table;
166 148
167 if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) 149 if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
@@ -169,6 +151,10 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
169 if (ipv4_is_multicast(addr)) 151 if (ipv4_is_multicast(addr))
170 return RTN_MULTICAST; 152 return RTN_MULTICAST;
171 153
154#ifdef CONFIG_IP_MULTIPLE_TABLES
155 res.r = NULL;
156#endif
157
172 local_table = fib_get_table(net, RT_TABLE_LOCAL); 158 local_table = fib_get_table(net, RT_TABLE_LOCAL);
173 if (local_table) { 159 if (local_table) {
174 ret = RTN_UNICAST; 160 ret = RTN_UNICAST;
@@ -195,44 +181,6 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
195} 181}
196EXPORT_SYMBOL(inet_dev_addr_type); 182EXPORT_SYMBOL(inet_dev_addr_type);
197 183
198__be32 fib_compute_spec_dst(struct sk_buff *skb)
199{
200 struct net_device *dev = skb->dev;
201 struct in_device *in_dev;
202 struct fib_result res;
203 struct rtable *rt;
204 struct flowi4 fl4;
205 struct net *net;
206 int scope;
207
208 rt = skb_rtable(skb);
209 if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
210 RTCF_LOCAL)
211 return ip_hdr(skb)->daddr;
212
213 in_dev = __in_dev_get_rcu(dev);
214 BUG_ON(!in_dev);
215
216 net = dev_net(dev);
217
218 scope = RT_SCOPE_UNIVERSE;
219 if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
220 fl4.flowi4_oif = 0;
221 fl4.flowi4_iif = LOOPBACK_IFINDEX;
222 fl4.daddr = ip_hdr(skb)->saddr;
223 fl4.saddr = 0;
224 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
225 fl4.flowi4_scope = scope;
226 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
227 if (!fib_lookup(net, &fl4, &res))
228 return FIB_RES_PREFSRC(net, res);
229 } else {
230 scope = RT_SCOPE_LINK;
231 }
232
233 return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
234}
235
236/* Given (packet source, input interface) and optional (dst, oif, tos): 184/* Given (packet source, input interface) and optional (dst, oif, tos):
237 * - (main) check, that source is valid i.e. not broadcast or our local 185 * - (main) check, that source is valid i.e. not broadcast or our local
238 * address. 186 * address.
@@ -241,15 +189,17 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
241 * - check, that packet arrived from expected physical interface. 189 * - check, that packet arrived from expected physical interface.
242 * called with rcu_read_lock() 190 * called with rcu_read_lock()
243 */ 191 */
244static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, 192int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
245 u8 tos, int oif, struct net_device *dev, 193 int oif, struct net_device *dev, __be32 *spec_dst,
246 int rpf, struct in_device *idev, u32 *itag) 194 u32 *itag)
247{ 195{
248 int ret, no_addr, accept_local; 196 struct in_device *in_dev;
249 struct fib_result res;
250 struct flowi4 fl4; 197 struct flowi4 fl4;
251 struct net *net; 198 struct fib_result res;
199 int no_addr, rpf, accept_local;
252 bool dev_match; 200 bool dev_match;
201 int ret;
202 struct net *net;
253 203
254 fl4.flowi4_oif = 0; 204 fl4.flowi4_oif = 0;
255 fl4.flowi4_iif = oif; 205 fl4.flowi4_iif = oif;
@@ -258,10 +208,20 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
258 fl4.flowi4_tos = tos; 208 fl4.flowi4_tos = tos;
259 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 209 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
260 210
261 no_addr = idev->ifa_list == NULL; 211 no_addr = rpf = accept_local = 0;
212 in_dev = __in_dev_get_rcu(dev);
213 if (in_dev) {
214 no_addr = in_dev->ifa_list == NULL;
215
216 /* Ignore rp_filter for packets protected by IPsec. */
217 rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
218
219 accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
220 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
221 }
262 222
263 accept_local = IN_DEV_ACCEPT_LOCAL(idev); 223 if (in_dev == NULL)
264 fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; 224 goto e_inval;
265 225
266 net = dev_net(dev); 226 net = dev_net(dev);
267 if (fib_lookup(net, &fl4, &res)) 227 if (fib_lookup(net, &fl4, &res))
@@ -270,6 +230,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
270 if (res.type != RTN_LOCAL || !accept_local) 230 if (res.type != RTN_LOCAL || !accept_local)
271 goto e_inval; 231 goto e_inval;
272 } 232 }
233 *spec_dst = FIB_RES_PREFSRC(net, res);
273 fib_combine_itag(itag, &res); 234 fib_combine_itag(itag, &res);
274 dev_match = false; 235 dev_match = false;
275 236
@@ -298,14 +259,17 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
298 259
299 ret = 0; 260 ret = 0;
300 if (fib_lookup(net, &fl4, &res) == 0) { 261 if (fib_lookup(net, &fl4, &res) == 0) {
301 if (res.type == RTN_UNICAST) 262 if (res.type == RTN_UNICAST) {
263 *spec_dst = FIB_RES_PREFSRC(net, res);
302 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 264 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
265 }
303 } 266 }
304 return ret; 267 return ret;
305 268
306last_resort: 269last_resort:
307 if (rpf) 270 if (rpf)
308 goto e_rpf; 271 goto e_rpf;
272 *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
309 *itag = 0; 273 *itag = 0;
310 return 0; 274 return 0;
311 275
@@ -315,21 +279,6 @@ e_rpf:
315 return -EXDEV; 279 return -EXDEV;
316} 280}
317 281
318/* Ignore rp_filter for packets protected by IPsec. */
319int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
320 u8 tos, int oif, struct net_device *dev,
321 struct in_device *idev, u32 *itag)
322{
323 int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
324
325 if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
326 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
327 *itag = 0;
328 return 0;
329 }
330 return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
331}
332
333static inline __be32 sk_extract_addr(struct sockaddr *addr) 282static inline __be32 sk_extract_addr(struct sockaddr *addr)
334{ 283{
335 return ((struct sockaddr_in *) addr)->sin_addr.s_addr; 284 return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
@@ -488,7 +437,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
488 switch (cmd) { 437 switch (cmd) {
489 case SIOCADDRT: /* Add a route */ 438 case SIOCADDRT: /* Add a route */
490 case SIOCDELRT: /* Delete a route */ 439 case SIOCDELRT: /* Delete a route */
491 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 440 if (!capable(CAP_NET_ADMIN))
492 return -EPERM; 441 return -EPERM;
493 442
494 if (copy_from_user(&rt, arg, sizeof(rt))) 443 if (copy_from_user(&rt, arg, sizeof(rt)))
@@ -558,7 +507,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
558 cfg->fc_flags = rtm->rtm_flags; 507 cfg->fc_flags = rtm->rtm_flags;
559 cfg->fc_nlflags = nlh->nlmsg_flags; 508 cfg->fc_nlflags = nlh->nlmsg_flags;
560 509
561 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 510 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
562 cfg->fc_nlinfo.nlh = nlh; 511 cfg->fc_nlinfo.nlh = nlh;
563 cfg->fc_nlinfo.nl_net = net; 512 cfg->fc_nlinfo.nl_net = net;
564 513
@@ -746,7 +695,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
746 if (ifa->ifa_flags & IFA_F_SECONDARY) { 695 if (ifa->ifa_flags & IFA_F_SECONDARY) {
747 prim = inet_ifa_byprefix(in_dev, prefix, mask); 696 prim = inet_ifa_byprefix(in_dev, prefix, mask);
748 if (prim == NULL) { 697 if (prim == NULL) {
749 pr_warn("%s: bug: prim == NULL\n", __func__); 698 printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
750 return; 699 return;
751 } 700 }
752 } 701 }
@@ -792,7 +741,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
792#define BRD_OK 2 741#define BRD_OK 2
793#define BRD0_OK 4 742#define BRD0_OK 4
794#define BRD1_OK 8 743#define BRD1_OK 8
795 unsigned int ok = 0; 744 unsigned ok = 0;
796 int subnet = 0; /* Primary network */ 745 int subnet = 0; /* Primary network */
797 int gone = 1; /* Address is missing */ 746 int gone = 1; /* Address is missing */
798 int same_prefsrc = 0; /* Another primary with same IP */ 747 int same_prefsrc = 0; /* Another primary with same IP */
@@ -800,11 +749,11 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
800 if (ifa->ifa_flags & IFA_F_SECONDARY) { 749 if (ifa->ifa_flags & IFA_F_SECONDARY) {
801 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); 750 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
802 if (prim == NULL) { 751 if (prim == NULL) {
803 pr_warn("%s: bug: prim == NULL\n", __func__); 752 printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
804 return; 753 return;
805 } 754 }
806 if (iprim && iprim != prim) { 755 if (iprim && iprim != prim) {
807 pr_warn("%s: bug: iprim != prim\n", __func__); 756 printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n");
808 return; 757 return;
809 } 758 }
810 } else if (!ipv4_is_zeronet(any) && 759 } else if (!ipv4_is_zeronet(any) &&
@@ -931,6 +880,10 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
931 .flowi4_scope = frn->fl_scope, 880 .flowi4_scope = frn->fl_scope,
932 }; 881 };
933 882
883#ifdef CONFIG_IP_MULTIPLE_TABLES
884 res.r = NULL;
885#endif
886
934 frn->err = -ENOENT; 887 frn->err = -ENOENT;
935 if (tb) { 888 if (tb) {
936 local_bh_disable(); 889 local_bh_disable();
@@ -956,7 +909,7 @@ static void nl_fib_input(struct sk_buff *skb)
956 struct fib_result_nl *frn; 909 struct fib_result_nl *frn;
957 struct nlmsghdr *nlh; 910 struct nlmsghdr *nlh;
958 struct fib_table *tb; 911 struct fib_table *tb;
959 u32 portid; 912 u32 pid;
960 913
961 net = sock_net(skb->sk); 914 net = sock_net(skb->sk);
962 nlh = nlmsg_hdr(skb); 915 nlh = nlmsg_hdr(skb);
@@ -974,20 +927,17 @@ static void nl_fib_input(struct sk_buff *skb)
974 927
975 nl_fib_lookup(frn, tb); 928 nl_fib_lookup(frn, tb);
976 929
977 portid = NETLINK_CB(skb).portid; /* pid of sending process */ 930 pid = NETLINK_CB(skb).pid; /* pid of sending process */
978 NETLINK_CB(skb).portid = 0; /* from kernel */ 931 NETLINK_CB(skb).pid = 0; /* from kernel */
979 NETLINK_CB(skb).dst_group = 0; /* unicast */ 932 NETLINK_CB(skb).dst_group = 0; /* unicast */
980 netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT); 933 netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
981} 934}
982 935
983static int __net_init nl_fib_lookup_init(struct net *net) 936static int __net_init nl_fib_lookup_init(struct net *net)
984{ 937{
985 struct sock *sk; 938 struct sock *sk;
986 struct netlink_kernel_cfg cfg = { 939 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
987 .input = nl_fib_input, 940 nl_fib_input, NULL, THIS_MODULE);
988 };
989
990 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
991 if (sk == NULL) 941 if (sk == NULL)
992 return -EAFNOSUPPORT; 942 return -EAFNOSUPPORT;
993 net->ipv4.fibnl = sk; 943 net->ipv4.fibnl = sk;
@@ -1000,11 +950,11 @@ static void nl_fib_lookup_exit(struct net *net)
1000 net->ipv4.fibnl = NULL; 950 net->ipv4.fibnl = NULL;
1001} 951}
1002 952
1003static void fib_disable_ip(struct net_device *dev, int force) 953static void fib_disable_ip(struct net_device *dev, int force, int delay)
1004{ 954{
1005 if (fib_sync_down_dev(dev, force)) 955 if (fib_sync_down_dev(dev, force))
1006 fib_flush(dev_net(dev)); 956 fib_flush(dev_net(dev));
1007 rt_cache_flush(dev_net(dev)); 957 rt_cache_flush(dev_net(dev), delay);
1008 arp_ifdown(dev); 958 arp_ifdown(dev);
1009} 959}
1010 960
@@ -1021,7 +971,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
1021 fib_sync_up(dev); 971 fib_sync_up(dev);
1022#endif 972#endif
1023 atomic_inc(&net->ipv4.dev_addr_genid); 973 atomic_inc(&net->ipv4.dev_addr_genid);
1024 rt_cache_flush(dev_net(dev)); 974 rt_cache_flush(dev_net(dev), -1);
1025 break; 975 break;
1026 case NETDEV_DOWN: 976 case NETDEV_DOWN:
1027 fib_del_ifaddr(ifa, NULL); 977 fib_del_ifaddr(ifa, NULL);
@@ -1030,9 +980,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
1030 /* Last address was deleted from this interface. 980 /* Last address was deleted from this interface.
1031 * Disable IP. 981 * Disable IP.
1032 */ 982 */
1033 fib_disable_ip(dev, 1); 983 fib_disable_ip(dev, 1, 0);
1034 } else { 984 } else {
1035 rt_cache_flush(dev_net(dev)); 985 rt_cache_flush(dev_net(dev), -1);
1036 } 986 }
1037 break; 987 break;
1038 } 988 }
@@ -1042,16 +992,16 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
1042static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) 992static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1043{ 993{
1044 struct net_device *dev = ptr; 994 struct net_device *dev = ptr;
1045 struct in_device *in_dev; 995 struct in_device *in_dev = __in_dev_get_rtnl(dev);
1046 struct net *net = dev_net(dev); 996 struct net *net = dev_net(dev);
1047 997
1048 if (event == NETDEV_UNREGISTER) { 998 if (event == NETDEV_UNREGISTER) {
1049 fib_disable_ip(dev, 2); 999 fib_disable_ip(dev, 2, -1);
1050 rt_flush_dev(dev);
1051 return NOTIFY_DONE; 1000 return NOTIFY_DONE;
1052 } 1001 }
1053 1002
1054 in_dev = __in_dev_get_rtnl(dev); 1003 if (!in_dev)
1004 return NOTIFY_DONE;
1055 1005
1056 switch (event) { 1006 switch (event) {
1057 case NETDEV_UP: 1007 case NETDEV_UP:
@@ -1062,14 +1012,21 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
1062 fib_sync_up(dev); 1012 fib_sync_up(dev);
1063#endif 1013#endif
1064 atomic_inc(&net->ipv4.dev_addr_genid); 1014 atomic_inc(&net->ipv4.dev_addr_genid);
1065 rt_cache_flush(net); 1015 rt_cache_flush(dev_net(dev), -1);
1066 break; 1016 break;
1067 case NETDEV_DOWN: 1017 case NETDEV_DOWN:
1068 fib_disable_ip(dev, 0); 1018 fib_disable_ip(dev, 0, 0);
1069 break; 1019 break;
1070 case NETDEV_CHANGEMTU: 1020 case NETDEV_CHANGEMTU:
1071 case NETDEV_CHANGE: 1021 case NETDEV_CHANGE:
1072 rt_cache_flush(net); 1022 rt_cache_flush(dev_net(dev), 0);
1023 break;
1024 case NETDEV_UNREGISTER_BATCH:
1025 /* The batch unregister is only called on the first
1026 * device in the list of devices being unregistered.
1027 * Therefore we should not pass dev_net(dev) in here.
1028 */
1029 rt_cache_flush_batch(NULL);
1073 break; 1030 break;
1074 } 1031 }
1075 return NOTIFY_DONE; 1032 return NOTIFY_DONE;
@@ -1134,9 +1091,6 @@ static int __net_init fib_net_init(struct net *net)
1134{ 1091{
1135 int error; 1092 int error;
1136 1093
1137#ifdef CONFIG_IP_ROUTE_CLASSID
1138 net->ipv4.fib_num_tclassid_users = 0;
1139#endif
1140 error = ip_fib_net_init(net); 1094 error = ip_fib_net_init(net);
1141 if (error < 0) 1095 if (error < 0)
1142 goto out; 1096 goto out;
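fib_frontend.c folds __fib_validate_source() back into fib_validate_source() and recomputes the rp_filter setting inside it, again skipping the check for IPsec-protected packets. The core of that reverse-path test, condensed for illustration only (the real hunk above also handles multipath and loose-mode matching):

/* rp_filter is ignored for IPsec-protected traffic; otherwise the
 * route back to the source must leave via the arrival device. */
static int rpf_check_example(struct sk_buff *skb, struct in_device *in_dev,
			     struct net_device *dev, struct fib_result *res)
{
	int rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);

	if (rpf && FIB_RES_DEV(*res) != dev)
		return -EXDEV;	/* fails strict reverse-path filtering */
	return 0;
}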
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 26aa65d1fce..a53bb1b5b11 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -26,7 +26,6 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/list.h> 27#include <linux/list.h>
28#include <linux/rcupdate.h> 28#include <linux/rcupdate.h>
29#include <linux/export.h>
30#include <net/ip.h> 29#include <net/ip.h>
31#include <net/route.h> 30#include <net/route.h>
32#include <net/tcp.h> 31#include <net/tcp.h>
@@ -47,7 +46,14 @@ struct fib4_rule {
47#endif 46#endif
48}; 47};
49 48
50int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) 49#ifdef CONFIG_IP_ROUTE_CLASSID
50u32 fib_rules_tclass(const struct fib_result *res)
51{
52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
53}
54#endif
55
56int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
51{ 57{
52 struct fib_lookup_arg arg = { 58 struct fib_lookup_arg arg = {
53 .result = res, 59 .result = res,
@@ -56,15 +62,10 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
56 int err; 62 int err;
57 63
58 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); 64 err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
59#ifdef CONFIG_IP_ROUTE_CLASSID 65 res->r = arg.rule;
60 if (arg.rule) 66
61 res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
62 else
63 res->tclassid = 0;
64#endif
65 return err; 67 return err;
66} 68}
67EXPORT_SYMBOL_GPL(__fib_lookup);
68 69
69static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, 70static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
70 int flags, struct fib_lookup_arg *arg) 71 int flags, struct fib_lookup_arg *arg)
@@ -166,11 +167,8 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
166 rule4->dst = nla_get_be32(tb[FRA_DST]); 167 rule4->dst = nla_get_be32(tb[FRA_DST]);
167 168
168#ifdef CONFIG_IP_ROUTE_CLASSID 169#ifdef CONFIG_IP_ROUTE_CLASSID
169 if (tb[FRA_FLOW]) { 170 if (tb[FRA_FLOW])
170 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); 171 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
171 if (rule4->tclassid)
172 net->ipv4.fib_num_tclassid_users++;
173 }
174#endif 172#endif
175 173
176 rule4->src_len = frh->src_len; 174 rule4->src_len = frh->src_len;
@@ -179,24 +177,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
179 rule4->dstmask = inet_make_mask(rule4->dst_len); 177 rule4->dstmask = inet_make_mask(rule4->dst_len);
180 rule4->tos = frh->tos; 178 rule4->tos = frh->tos;
181 179
182 net->ipv4.fib_has_custom_rules = true;
183 err = 0; 180 err = 0;
184errout: 181errout:
185 return err; 182 return err;
186} 183}
187 184
188static void fib4_rule_delete(struct fib_rule *rule)
189{
190 struct net *net = rule->fr_net;
191#ifdef CONFIG_IP_ROUTE_CLASSID
192 struct fib4_rule *rule4 = (struct fib4_rule *) rule;
193
194 if (rule4->tclassid)
195 net->ipv4.fib_num_tclassid_users--;
196#endif
197 net->ipv4.fib_has_custom_rules = true;
198}
199
200static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, 185static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
201 struct nlattr **tb) 186 struct nlattr **tb)
202{ 187{
@@ -234,15 +219,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
234 frh->src_len = rule4->src_len; 219 frh->src_len = rule4->src_len;
235 frh->tos = rule4->tos; 220 frh->tos = rule4->tos;
236 221
237 if ((rule4->dst_len && 222 if (rule4->dst_len)
238 nla_put_be32(skb, FRA_DST, rule4->dst)) || 223 NLA_PUT_BE32(skb, FRA_DST, rule4->dst);
239 (rule4->src_len && 224
240 nla_put_be32(skb, FRA_SRC, rule4->src))) 225 if (rule4->src_len)
241 goto nla_put_failure; 226 NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
227
242#ifdef CONFIG_IP_ROUTE_CLASSID 228#ifdef CONFIG_IP_ROUTE_CLASSID
243 if (rule4->tclassid && 229 if (rule4->tclassid)
244 nla_put_u32(skb, FRA_FLOW, rule4->tclassid)) 230 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
245 goto nla_put_failure;
246#endif 231#endif
247 return 0; 232 return 0;
248 233
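[Annotation] The fib4_rule_fill() hunk is one instance of a tree-wide conversion between two ways of appending netlink attributes. The NLA_PUT_* macros (right column) hid a jump to the local nla_put_failure label; the nla_put_*() helpers (left column) return non-zero on overflow and the caller branches explicitly. A rough sketch of the removed macro and the two equivalent call sites, for orientation (the macro body is paraphrased, not quoted from the tree):

        /* roughly what the removed macro family expanded to: */
        #define NLA_PUT_BE32(skb, attrtype, value)                      \
                do {                                                    \
                        if (nla_put_be32(skb, attrtype, value) < 0)     \
                                goto nla_put_failure;                   \
                } while (0)

        /* right column: implicit error path inside the macro */
        if (rule4->dst_len)
                NLA_PUT_BE32(skb, FRA_DST, rule4->dst);

        /* left column: explicit error path at the call site */
        if (rule4->dst_len &&
            nla_put_be32(skb, FRA_DST, rule4->dst))
                goto nla_put_failure;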
@@ -259,17 +244,16 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
259 244
260static void fib4_rule_flush_cache(struct fib_rules_ops *ops) 245static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
261{ 246{
262 rt_cache_flush(ops->fro_net); 247 rt_cache_flush(ops->fro_net, -1);
263} 248}
264 249
265static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { 250static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
266 .family = AF_INET, 251 .family = AF_INET,
267 .rule_size = sizeof(struct fib4_rule), 252 .rule_size = sizeof(struct fib4_rule),
268 .addr_size = sizeof(u32), 253 .addr_size = sizeof(u32),
269 .action = fib4_rule_action, 254 .action = fib4_rule_action,
270 .match = fib4_rule_match, 255 .match = fib4_rule_match,
271 .configure = fib4_rule_configure, 256 .configure = fib4_rule_configure,
272 .delete = fib4_rule_delete,
273 .compare = fib4_rule_compare, 257 .compare = fib4_rule_compare,
274 .fill = fib4_rule_fill, 258 .fill = fib4_rule_fill,
275 .default_pref = fib_default_rule_pref, 259 .default_pref = fib_default_rule_pref,
@@ -309,7 +293,6 @@ int __net_init fib4_rules_init(struct net *net)
309 if (err < 0) 293 if (err < 0)
310 goto fail; 294 goto fail;
311 net->ipv4.rules_ops = ops; 295 net->ipv4.rules_ops = ops;
312 net->ipv4.fib_has_custom_rules = false;
313 return 0; 296 return 0;
314 297
315fail: 298fail:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4797a800faf..80106d89d54 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include <asm/uaccess.h> 16#include <asm/uaccess.h>
17#include <asm/system.h>
17#include <linux/bitops.h> 18#include <linux/bitops.h>
18#include <linux/types.h> 19#include <linux/types.h>
19#include <linux/kernel.h> 20#include <linux/kernel.h>
@@ -140,77 +141,11 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
140 }, 141 },
141}; 142};
142 143
143static void rt_fibinfo_free(struct rtable __rcu **rtp)
144{
145 struct rtable *rt = rcu_dereference_protected(*rtp, 1);
146
147 if (!rt)
148 return;
149
150 /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
151 * because we waited an RCU grace period before calling
152 * free_fib_info_rcu()
153 */
154
155 dst_free(&rt->dst);
156}
157
158static void free_nh_exceptions(struct fib_nh *nh)
159{
160 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
161 int i;
162
163 for (i = 0; i < FNHE_HASH_SIZE; i++) {
164 struct fib_nh_exception *fnhe;
165
166 fnhe = rcu_dereference_protected(hash[i].chain, 1);
167 while (fnhe) {
168 struct fib_nh_exception *next;
169
170 next = rcu_dereference_protected(fnhe->fnhe_next, 1);
171
172 rt_fibinfo_free(&fnhe->fnhe_rth);
173
174 kfree(fnhe);
175
176 fnhe = next;
177 }
178 }
179 kfree(hash);
180}
181
182static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
183{
184 int cpu;
185
186 if (!rtp)
187 return;
188
189 for_each_possible_cpu(cpu) {
190 struct rtable *rt;
191
192 rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
193 if (rt)
194 dst_free(&rt->dst);
195 }
196 free_percpu(rtp);
197}
198
199/* Release a nexthop info record */ 144/* Release a nexthop info record */
200static void free_fib_info_rcu(struct rcu_head *head) 145static void free_fib_info_rcu(struct rcu_head *head)
201{ 146{
202 struct fib_info *fi = container_of(head, struct fib_info, rcu); 147 struct fib_info *fi = container_of(head, struct fib_info, rcu);
203 148
204 change_nexthops(fi) {
205 if (nexthop_nh->nh_dev)
206 dev_put(nexthop_nh->nh_dev);
207 if (nexthop_nh->nh_exceptions)
208 free_nh_exceptions(nexthop_nh);
209 rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
210 rt_fibinfo_free(&nexthop_nh->nh_rth_input);
211 } endfor_nexthops(fi);
212
213 release_net(fi->fib_net);
214 if (fi->fib_metrics != (u32 *) dst_default_metrics) 149 if (fi->fib_metrics != (u32 *) dst_default_metrics)
215 kfree(fi->fib_metrics); 150 kfree(fi->fib_metrics);
216 kfree(fi); 151 kfree(fi);
@@ -219,16 +154,16 @@ static void free_fib_info_rcu(struct rcu_head *head)
219void free_fib_info(struct fib_info *fi) 154void free_fib_info(struct fib_info *fi)
220{ 155{
221 if (fi->fib_dead == 0) { 156 if (fi->fib_dead == 0) {
222 pr_warn("Freeing alive fib_info %p\n", fi); 157 pr_warning("Freeing alive fib_info %p\n", fi);
223 return; 158 return;
224 } 159 }
225 fib_info_cnt--;
226#ifdef CONFIG_IP_ROUTE_CLASSID
227 change_nexthops(fi) { 160 change_nexthops(fi) {
228 if (nexthop_nh->nh_tclassid) 161 if (nexthop_nh->nh_dev)
229 fi->fib_net->ipv4.fib_num_tclassid_users--; 162 dev_put(nexthop_nh->nh_dev);
163 nexthop_nh->nh_dev = NULL;
230 } endfor_nexthops(fi); 164 } endfor_nexthops(fi);
231#endif 165 fib_info_cnt--;
166 release_net(fi->fib_net);
232 call_rcu(&fi->rcu, free_fib_info_rcu); 167 call_rcu(&fi->rcu, free_fib_info_rcu);
233} 168}
234 169
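[Annotation] Both columns of the free_fib_info() hunk keep the same RCU-deferred teardown: the final kfree() runs in an RCU callback, so lockless readers still traversing the FIB never touch freed memory. A minimal sketch of that pattern with illustrative names:

        /* assumes <linux/rcupdate.h>, <linux/slab.h> */
        struct example_obj {
                int value;
                struct rcu_head rcu;
        };

        static void example_free_rcu(struct rcu_head *head)
        {
                struct example_obj *obj = container_of(head, struct example_obj, rcu);

                kfree(obj);                     /* runs only after a grace period */
        }

        static void example_release(struct example_obj *obj)
        {
                call_rcu(&obj->rcu, example_free_rcu);  /* defer the actual free */
        }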
@@ -314,7 +249,6 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
314 nfi->fib_scope == fi->fib_scope && 249 nfi->fib_scope == fi->fib_scope &&
315 nfi->fib_prefsrc == fi->fib_prefsrc && 250 nfi->fib_prefsrc == fi->fib_prefsrc &&
316 nfi->fib_priority == fi->fib_priority && 251 nfi->fib_priority == fi->fib_priority &&
317 nfi->fib_type == fi->fib_type &&
318 memcmp(nfi->fib_metrics, fi->fib_metrics, 252 memcmp(nfi->fib_metrics, fi->fib_metrics,
319 sizeof(u32) * RTAX_MAX) == 0 && 253 sizeof(u32) * RTAX_MAX) == 0 &&
320 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && 254 ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
@@ -392,7 +326,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
392 if (skb == NULL) 326 if (skb == NULL)
393 goto errout; 327 goto errout;
394 328
395 err = fib_dump_info(skb, info->portid, seq, event, tb_id, 329 err = fib_dump_info(skb, info->pid, seq, event, tb_id,
396 fa->fa_type, key, dst_len, 330 fa->fa_type, key, dst_len,
397 fa->fa_tos, fa->fa_info, nlm_flags); 331 fa->fa_tos, fa->fa_info, nlm_flags);
398 if (err < 0) { 332 if (err < 0) {
@@ -401,7 +335,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
401 kfree_skb(skb); 335 kfree_skb(skb);
402 goto errout; 336 goto errout;
403 } 337 }
404 rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE, 338 rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
405 info->nlh, GFP_KERNEL); 339 info->nlh, GFP_KERNEL);
406 return; 340 return;
407errout: 341errout:
@@ -488,8 +422,6 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
488#ifdef CONFIG_IP_ROUTE_CLASSID 422#ifdef CONFIG_IP_ROUTE_CLASSID
489 nla = nla_find(attrs, attrlen, RTA_FLOW); 423 nla = nla_find(attrs, attrlen, RTA_FLOW);
490 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 424 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
491 if (nexthop_nh->nh_tclassid)
492 fi->fib_net->ipv4.fib_num_tclassid_users++;
493#endif 425#endif
494 } 426 }
495 427
@@ -803,7 +735,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
803 unsigned int bytes; 735 unsigned int bytes;
804 736
805 if (!new_size) 737 if (!new_size)
806 new_size = 16; 738 new_size = 1;
807 bytes = new_size * sizeof(struct hlist_head *); 739 bytes = new_size * sizeof(struct hlist_head *);
808 new_info_hash = fib_info_hash_alloc(bytes); 740 new_info_hash = fib_info_hash_alloc(bytes);
809 new_laddrhash = fib_info_hash_alloc(bytes); 741 new_laddrhash = fib_info_hash_alloc(bytes);
@@ -834,14 +766,10 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
834 fi->fib_flags = cfg->fc_flags; 766 fi->fib_flags = cfg->fc_flags;
835 fi->fib_priority = cfg->fc_priority; 767 fi->fib_priority = cfg->fc_priority;
836 fi->fib_prefsrc = cfg->fc_prefsrc; 768 fi->fib_prefsrc = cfg->fc_prefsrc;
837 fi->fib_type = cfg->fc_type;
838 769
839 fi->fib_nhs = nhs; 770 fi->fib_nhs = nhs;
840 change_nexthops(fi) { 771 change_nexthops(fi) {
841 nexthop_nh->nh_parent = fi; 772 nexthop_nh->nh_parent = fi;
842 nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
843 if (!nexthop_nh->nh_pcpu_rth_output)
844 goto failure;
845 } endfor_nexthops(fi) 773 } endfor_nexthops(fi)
846 774
847 if (cfg->fc_mx) { 775 if (cfg->fc_mx) {
@@ -852,16 +780,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
852 int type = nla_type(nla); 780 int type = nla_type(nla);
853 781
854 if (type) { 782 if (type) {
855 u32 val;
856
857 if (type > RTAX_MAX) 783 if (type > RTAX_MAX)
858 goto err_inval; 784 goto err_inval;
859 val = nla_get_u32(nla); 785 fi->fib_metrics[type - 1] = nla_get_u32(nla);
860 if (type == RTAX_ADVMSS && val > 65535 - 40)
861 val = 65535 - 40;
862 if (type == RTAX_MTU && val > 65535 - 15)
863 val = 65535 - 15;
864 fi->fib_metrics[type - 1] = val;
865 } 786 }
866 } 787 }
867 } 788 }
@@ -890,8 +811,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
890 nh->nh_flags = cfg->fc_flags; 811 nh->nh_flags = cfg->fc_flags;
891#ifdef CONFIG_IP_ROUTE_CLASSID 812#ifdef CONFIG_IP_ROUTE_CLASSID
892 nh->nh_tclassid = cfg->fc_flow; 813 nh->nh_tclassid = cfg->fc_flow;
893 if (nh->nh_tclassid)
894 fi->fib_net->ipv4.fib_num_tclassid_users++;
895#endif 814#endif
896#ifdef CONFIG_IP_ROUTE_MULTIPATH 815#ifdef CONFIG_IP_ROUTE_MULTIPATH
897 nh->nh_weight = 1; 816 nh->nh_weight = 1;
@@ -993,14 +912,14 @@ failure:
993 return ERR_PTR(err); 912 return ERR_PTR(err);
994} 913}
995 914
996int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, 915int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
997 u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, 916 u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
998 struct fib_info *fi, unsigned int flags) 917 struct fib_info *fi, unsigned int flags)
999{ 918{
1000 struct nlmsghdr *nlh; 919 struct nlmsghdr *nlh;
1001 struct rtmsg *rtm; 920 struct rtmsg *rtm;
1002 921
1003 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); 922 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
1004 if (nlh == NULL) 923 if (nlh == NULL)
1005 return -EMSGSIZE; 924 return -EMSGSIZE;
1006 925
@@ -1013,36 +932,33 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
1013 rtm->rtm_table = tb_id; 932 rtm->rtm_table = tb_id;
1014 else 933 else
1015 rtm->rtm_table = RT_TABLE_COMPAT; 934 rtm->rtm_table = RT_TABLE_COMPAT;
1016 if (nla_put_u32(skb, RTA_TABLE, tb_id)) 935 NLA_PUT_U32(skb, RTA_TABLE, tb_id);
1017 goto nla_put_failure;
1018 rtm->rtm_type = type; 936 rtm->rtm_type = type;
1019 rtm->rtm_flags = fi->fib_flags; 937 rtm->rtm_flags = fi->fib_flags;
1020 rtm->rtm_scope = fi->fib_scope; 938 rtm->rtm_scope = fi->fib_scope;
1021 rtm->rtm_protocol = fi->fib_protocol; 939 rtm->rtm_protocol = fi->fib_protocol;
1022 940
1023 if (rtm->rtm_dst_len && 941 if (rtm->rtm_dst_len)
1024 nla_put_be32(skb, RTA_DST, dst)) 942 NLA_PUT_BE32(skb, RTA_DST, dst);
1025 goto nla_put_failure; 943
1026 if (fi->fib_priority && 944 if (fi->fib_priority)
1027 nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) 945 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
1028 goto nla_put_failure; 946
1029 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) 947 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
1030 goto nla_put_failure; 948 goto nla_put_failure;
1031 949
1032 if (fi->fib_prefsrc && 950 if (fi->fib_prefsrc)
1033 nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc)) 951 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
1034 goto nla_put_failure; 952
1035 if (fi->fib_nhs == 1) { 953 if (fi->fib_nhs == 1) {
1036 if (fi->fib_nh->nh_gw && 954 if (fi->fib_nh->nh_gw)
1037 nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) 955 NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
1038 goto nla_put_failure; 956
1039 if (fi->fib_nh->nh_oif && 957 if (fi->fib_nh->nh_oif)
1040 nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) 958 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
1041 goto nla_put_failure;
1042#ifdef CONFIG_IP_ROUTE_CLASSID 959#ifdef CONFIG_IP_ROUTE_CLASSID
1043 if (fi->fib_nh[0].nh_tclassid && 960 if (fi->fib_nh[0].nh_tclassid)
1044 nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) 961 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
1045 goto nla_put_failure;
1046#endif 962#endif
1047 } 963 }
1048#ifdef CONFIG_IP_ROUTE_MULTIPATH 964#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1063,13 +979,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
1063 rtnh->rtnh_hops = nh->nh_weight - 1; 979 rtnh->rtnh_hops = nh->nh_weight - 1;
1064 rtnh->rtnh_ifindex = nh->nh_oif; 980 rtnh->rtnh_ifindex = nh->nh_oif;
1065 981
1066 if (nh->nh_gw && 982 if (nh->nh_gw)
1067 nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw)) 983 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1068 goto nla_put_failure;
1069#ifdef CONFIG_IP_ROUTE_CLASSID 984#ifdef CONFIG_IP_ROUTE_CLASSID
1070 if (nh->nh_tclassid && 985 if (nh->nh_tclassid)
1071 nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) 986 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1072 goto nla_put_failure;
1073#endif 987#endif
1074 /* length of rtnetlink header + attributes */ 988 /* length of rtnetlink header + attributes */
1075 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; 989 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 31d771ca9a7..de9e2978476 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -51,6 +51,7 @@
51#define VERSION "0.409" 51#define VERSION "0.409"
52 52
53#include <asm/uaccess.h> 53#include <asm/uaccess.h>
54#include <asm/system.h>
54#include <linux/bitops.h> 55#include <linux/bitops.h>
55#include <linux/types.h> 56#include <linux/types.h>
56#include <linux/kernel.h> 57#include <linux/kernel.h>
@@ -72,7 +73,6 @@
72#include <linux/list.h> 73#include <linux/list.h>
73#include <linux/slab.h> 74#include <linux/slab.h>
74#include <linux/prefetch.h> 75#include <linux/prefetch.h>
75#include <linux/export.h>
76#include <net/net_namespace.h> 76#include <net/net_namespace.h>
77#include <net/ip.h> 77#include <net/ip.h>
78#include <net/protocol.h> 78#include <net/protocol.h>
@@ -159,6 +159,7 @@ struct trie {
159#endif 159#endif
160}; 160};
161 161
162static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
162static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, 163static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
163 int wasfull); 164 int wasfull);
164static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); 165static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
@@ -367,7 +368,7 @@ static void __leaf_free_rcu(struct rcu_head *head)
367 368
368static inline void free_leaf(struct leaf *l) 369static inline void free_leaf(struct leaf *l)
369{ 370{
370 call_rcu(&l->rcu, __leaf_free_rcu); 371 call_rcu_bh(&l->rcu, __leaf_free_rcu);
371} 372}
372 373
373static inline void free_leaf_info(struct leaf_info *leaf) 374static inline void free_leaf_info(struct leaf_info *leaf)
@@ -472,7 +473,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
472 } 473 }
473 474
474 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), 475 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
475 sizeof(struct rt_trie_node *) << bits); 476 sizeof(struct rt_trie_node) << bits);
476 return tn; 477 return tn;
477} 478}
478 479
@@ -489,7 +490,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *
489 return ((struct tnode *) n)->pos == tn->pos + tn->bits; 490 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
490} 491}
491 492
492static inline void put_child(struct tnode *tn, int i, 493static inline void put_child(struct trie *t, struct tnode *tn, int i,
493 struct rt_trie_node *n) 494 struct rt_trie_node *n)
494{ 495{
495 tnode_put_child_reorg(tn, i, n, -1); 496 tnode_put_child_reorg(tn, i, n, -1);
@@ -753,8 +754,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
753 goto nomem; 754 goto nomem;
754 } 755 }
755 756
756 put_child(tn, 2*i, (struct rt_trie_node *) left); 757 put_child(t, tn, 2*i, (struct rt_trie_node *) left);
757 put_child(tn, 2*i+1, (struct rt_trie_node *) right); 758 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
758 } 759 }
759 } 760 }
760 761
@@ -775,9 +776,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
775 if (tkey_extract_bits(node->key, 776 if (tkey_extract_bits(node->key,
776 oldtnode->pos + oldtnode->bits, 777 oldtnode->pos + oldtnode->bits,
777 1) == 0) 778 1) == 0)
778 put_child(tn, 2*i, node); 779 put_child(t, tn, 2*i, node);
779 else 780 else
780 put_child(tn, 2*i+1, node); 781 put_child(t, tn, 2*i+1, node);
781 continue; 782 continue;
782 } 783 }
783 784
@@ -785,8 +786,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
785 inode = (struct tnode *) node; 786 inode = (struct tnode *) node;
786 787
787 if (inode->bits == 1) { 788 if (inode->bits == 1) {
788 put_child(tn, 2*i, rtnl_dereference(inode->child[0])); 789 put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
789 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); 790 put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
790 791
791 tnode_free_safe(inode); 792 tnode_free_safe(inode);
792 continue; 793 continue;
@@ -816,22 +817,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
816 */ 817 */
817 818
818 left = (struct tnode *) tnode_get_child(tn, 2*i); 819 left = (struct tnode *) tnode_get_child(tn, 2*i);
819 put_child(tn, 2*i, NULL); 820 put_child(t, tn, 2*i, NULL);
820 821
821 BUG_ON(!left); 822 BUG_ON(!left);
822 823
823 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 824 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
824 put_child(tn, 2*i+1, NULL); 825 put_child(t, tn, 2*i+1, NULL);
825 826
826 BUG_ON(!right); 827 BUG_ON(!right);
827 828
828 size = tnode_child_length(left); 829 size = tnode_child_length(left);
829 for (j = 0; j < size; j++) { 830 for (j = 0; j < size; j++) {
830 put_child(left, j, rtnl_dereference(inode->child[j])); 831 put_child(t, left, j, rtnl_dereference(inode->child[j]));
831 put_child(right, j, rtnl_dereference(inode->child[j + size])); 832 put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
832 } 833 }
833 put_child(tn, 2*i, resize(t, left)); 834 put_child(t, tn, 2*i, resize(t, left));
834 put_child(tn, 2*i+1, resize(t, right)); 835 put_child(t, tn, 2*i+1, resize(t, right));
835 836
836 tnode_free_safe(inode); 837 tnode_free_safe(inode);
837 } 838 }
@@ -876,7 +877,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
876 if (!newn) 877 if (!newn)
877 goto nomem; 878 goto nomem;
878 879
879 put_child(tn, i/2, (struct rt_trie_node *)newn); 880 put_child(t, tn, i/2, (struct rt_trie_node *)newn);
880 } 881 }
881 882
882 } 883 }
@@ -891,21 +892,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
891 if (left == NULL) { 892 if (left == NULL) {
892 if (right == NULL) /* Both are empty */ 893 if (right == NULL) /* Both are empty */
893 continue; 894 continue;
894 put_child(tn, i/2, right); 895 put_child(t, tn, i/2, right);
895 continue; 896 continue;
896 } 897 }
897 898
898 if (right == NULL) { 899 if (right == NULL) {
899 put_child(tn, i/2, left); 900 put_child(t, tn, i/2, left);
900 continue; 901 continue;
901 } 902 }
902 903
903 /* Two nonempty children */ 904 /* Two nonempty children */
904 newBinNode = (struct tnode *) tnode_get_child(tn, i/2); 905 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
905 put_child(tn, i/2, NULL); 906 put_child(t, tn, i/2, NULL);
906 put_child(newBinNode, 0, left); 907 put_child(t, newBinNode, 0, left);
907 put_child(newBinNode, 1, right); 908 put_child(t, newBinNode, 1, right);
908 put_child(tn, i/2, resize(t, newBinNode)); 909 put_child(t, tn, i/2, resize(t, newBinNode));
909 } 910 }
910 tnode_free_safe(oldtnode); 911 tnode_free_safe(oldtnode);
911 return tn; 912 return tn;
@@ -1006,9 +1007,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1006 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { 1007 while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
1007 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1008 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1008 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 1009 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1009 tn = (struct tnode *)resize(t, tn); 1010 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1010 1011
1011 tnode_put_child_reorg(tp, cindex, 1012 tnode_put_child_reorg((struct tnode *)tp, cindex,
1012 (struct rt_trie_node *)tn, wasfull); 1013 (struct rt_trie_node *)tn, wasfull);
1013 1014
1014 tp = node_parent((struct rt_trie_node *) tn); 1015 tp = node_parent((struct rt_trie_node *) tn);
@@ -1023,7 +1024,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
1023 1024
1024 /* Handle last (top) tnode */ 1025 /* Handle last (top) tnode */
1025 if (IS_TNODE(tn)) 1026 if (IS_TNODE(tn))
1026 tn = (struct tnode *)resize(t, tn); 1027 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1027 1028
1028 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); 1029 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1029 tnode_free_flush(); 1030 tnode_free_flush();
@@ -1124,7 +1125,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1124 node_set_parent((struct rt_trie_node *)l, tp); 1125 node_set_parent((struct rt_trie_node *)l, tp);
1125 1126
1126 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1127 put_child(tp, cindex, (struct rt_trie_node *)l); 1128 put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
1128 } else { 1129 } else {
1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1130 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1130 /* 1131 /*
@@ -1154,12 +1155,13 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1154 node_set_parent((struct rt_trie_node *)tn, tp); 1155 node_set_parent((struct rt_trie_node *)tn, tp);
1155 1156
1156 missbit = tkey_extract_bits(key, newpos, 1); 1157 missbit = tkey_extract_bits(key, newpos, 1);
1157 put_child(tn, missbit, (struct rt_trie_node *)l); 1158 put_child(t, tn, missbit, (struct rt_trie_node *)l);
1158 put_child(tn, 1-missbit, n); 1159 put_child(t, tn, 1-missbit, n);
1159 1160
1160 if (tp) { 1161 if (tp) {
1161 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1162 put_child(tp, cindex, (struct rt_trie_node *)tn); 1163 put_child(t, (struct tnode *)tp, cindex,
1164 (struct rt_trie_node *)tn);
1163 } else { 1165 } else {
1164 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); 1166 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1165 tp = tn; 1167 tp = tn;
@@ -1167,8 +1169,9 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1167 } 1169 }
1168 1170
1169 if (tp && tp->pos + tp->bits > 32) 1171 if (tp && tp->pos + tp->bits > 32)
1170 pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1172 pr_warning("fib_trie"
1171 tp, tp->pos, tp->bits, key, plen); 1173 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1174 tp, tp->pos, tp->bits, key, plen);
1172 1175
1173 /* Rebalance the trie */ 1176 /* Rebalance the trie */
1174 1177
@@ -1286,7 +1289,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1286 1289
1287 fib_release_info(fi_drop); 1290 fib_release_info(fi_drop);
1288 if (state & FA_S_ACCESSED) 1291 if (state & FA_S_ACCESSED)
1289 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1292 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1290 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, 1293 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1291 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); 1294 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1292 1295
@@ -1333,7 +1336,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1333 list_add_tail_rcu(&new_fa->fa_list, 1336 list_add_tail_rcu(&new_fa->fa_list,
1334 (fa ? &fa->fa_list : fa_head)); 1337 (fa ? &fa->fa_list : fa_head));
1335 1338
1336 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1339 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1337 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, 1340 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1338 &cfg->fc_nlinfo, 0); 1341 &cfg->fc_nlinfo, 0);
1339succeeded: 1342succeeded:
@@ -1368,8 +1371,6 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1368 1371
1369 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) 1372 if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1370 continue; 1373 continue;
1371 if (fi->fib_dead)
1372 continue;
1373 if (fa->fa_info->fib_scope < flp->flowi4_scope) 1374 if (fa->fa_info->fib_scope < flp->flowi4_scope)
1374 continue; 1375 continue;
1375 fib_alias_accessed(fa); 1376 fib_alias_accessed(fa);
@@ -1550,8 +1551,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1550 * state.directly. 1551 * state.directly.
1551 */ 1552 */
1552 if (pref_mismatch) { 1553 if (pref_mismatch) {
1553 /* fls(x) = __fls(x) + 1 */ 1554 int mp = KEYLENGTH - fls(pref_mismatch);
1554 int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
1555 1555
1556 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) 1556 if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1557 goto backtrace; 1557 goto backtrace;
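[Annotation] The fib_table_lookup() hunk above swaps fls() for __fls() using the identity noted in the left column's comment: fls(x) = __fls(x) + 1 for non-zero x, so both expressions compute the same mismatch position. A small userspace check of that identity (kernel fls()/__fls() approximated with GCC builtins; not kernel code):

        #include <assert.h>
        #include <stdint.h>

        static int fls32(uint32_t x)    /* 1-based index of highest set bit, 0 if x == 0 */
        {
                return x ? 32 - __builtin_clz(x) : 0;
        }

        static int __fls32(uint32_t x)  /* 0-based index of highest set bit, x must be non-zero */
        {
                return 31 - __builtin_clz(x);
        }

        int main(void)
        {
                const int KEYLENGTH = 32;
                uint32_t x;

                for (x = 1; x != 0; x <<= 1)
                        assert(KEYLENGTH - fls32(x) == KEYLENGTH - __fls32(x) - 1);
                return 0;
        }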
@@ -1606,7 +1606,6 @@ found:
1606 rcu_read_unlock(); 1606 rcu_read_unlock();
1607 return ret; 1607 return ret;
1608} 1608}
1609EXPORT_SYMBOL_GPL(fib_table_lookup);
1610 1609
1611/* 1610/*
1612 * Remove the leaf and return parent. 1611 * Remove the leaf and return parent.
@@ -1619,10 +1618,10 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
1619 1618
1620 if (tp) { 1619 if (tp) {
1621 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); 1620 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1622 put_child(tp, cindex, NULL); 1621 put_child(t, (struct tnode *)tp, cindex, NULL);
1623 trie_rebalance(t, tp); 1622 trie_rebalance(t, tp);
1624 } else 1623 } else
1625 RCU_INIT_POINTER(t->trie, NULL); 1624 rcu_assign_pointer(t->trie, NULL);
1626 1625
1627 free_leaf(l); 1626 free_leaf(l);
1628} 1627}
@@ -1656,12 +1655,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1656 if (!l) 1655 if (!l)
1657 return -ESRCH; 1656 return -ESRCH;
1658 1657
1659 li = find_leaf_info(l, plen); 1658 fa_head = get_fa_head(l, plen);
1660
1661 if (!li)
1662 return -ESRCH;
1663
1664 fa_head = &li->falh;
1665 fa = fib_find_alias(fa_head, tos, 0); 1659 fa = fib_find_alias(fa_head, tos, 0);
1666 1660
1667 if (!fa) 1661 if (!fa)
@@ -1697,6 +1691,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1697 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, 1691 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1698 &cfg->fc_nlinfo, 0); 1692 &cfg->fc_nlinfo, 0);
1699 1693
1694 l = fib_find_node(t, key);
1695 li = find_leaf_info(l, plen);
1696
1700 list_del_rcu(&fa->fa_list); 1697 list_del_rcu(&fa->fa_list);
1701 1698
1702 if (!plen) 1699 if (!plen)
@@ -1711,7 +1708,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1711 trie_leaf_remove(t, l); 1708 trie_leaf_remove(t, l);
1712 1709
1713 if (fa->fa_state & FA_S_ACCESSED) 1710 if (fa->fa_state & FA_S_ACCESSED)
1714 rt_cache_flush(cfg->fc_nlinfo.nl_net); 1711 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1715 1712
1716 fib_release_info(fa->fa_info); 1713 fib_release_info(fa->fa_info);
1717 alias_free_mem_rcu(fa); 1714 alias_free_mem_rcu(fa);
@@ -1873,7 +1870,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1873 continue; 1870 continue;
1874 } 1871 }
1875 1872
1876 if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, 1873 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1877 cb->nlh->nlmsg_seq, 1874 cb->nlh->nlmsg_seq,
1878 RTM_NEWROUTE, 1875 RTM_NEWROUTE,
1879 tb->tb_id, 1876 tb->tb_id,
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 42a491055c7..dbfc21de347 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -10,8 +10,6 @@
10 * 10 *
11 */ 11 */
12 12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15#include <linux/module.h> 13#include <linux/module.h>
16#include <linux/kernel.h> 14#include <linux/kernel.h>
17#include <linux/kmod.h> 15#include <linux/kmod.h>
@@ -36,7 +34,7 @@ int gre_add_protocol(const struct gre_protocol *proto, u8 version)
36 if (gre_proto[version]) 34 if (gre_proto[version])
37 goto err_out_unlock; 35 goto err_out_unlock;
38 36
39 RCU_INIT_POINTER(gre_proto[version], proto); 37 rcu_assign_pointer(gre_proto[version], proto);
40 spin_unlock(&gre_proto_lock); 38 spin_unlock(&gre_proto_lock);
41 return 0; 39 return 0;
42 40
@@ -56,7 +54,7 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
56 if (rcu_dereference_protected(gre_proto[version], 54 if (rcu_dereference_protected(gre_proto[version],
57 lockdep_is_held(&gre_proto_lock)) != proto) 55 lockdep_is_held(&gre_proto_lock)) != proto)
58 goto err_out_unlock; 56 goto err_out_unlock;
59 RCU_INIT_POINTER(gre_proto[version], NULL); 57 rcu_assign_pointer(gre_proto[version], NULL);
60 spin_unlock(&gre_proto_lock); 58 spin_unlock(&gre_proto_lock);
61 synchronize_rcu(); 59 synchronize_rcu();
62 return 0; 60 return 0;
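[Annotation] In gre.c, one column publishes and clears gre_proto[] with rcu_assign_pointer() and the other with RCU_INIT_POINTER(). The difference is the memory barrier: rcu_assign_pointer() orders the pointee's initialization before the pointer becomes visible to readers, while RCU_INIT_POINTER() is a plain store that is only safe when no ordering is needed, e.g. when storing NULL or a pointer no reader can yet reach. A small illustrative sketch (names hypothetical):

        struct foo {
                int a;
        };

        static struct foo __rcu *gp;

        static void publish(struct foo *p)
        {
                p->a = 1;
                rcu_assign_pointer(gp, p);      /* barrier: a reader that sees p also sees p->a == 1 */
        }

        static void unpublish(void)
        {
                RCU_INIT_POINTER(gp, NULL);     /* storing NULL publishes nothing, no barrier needed */
                synchronize_rcu();              /* wait out existing readers before freeing */
        }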
@@ -120,10 +118,10 @@ static const struct net_protocol net_gre_protocol = {
120 118
121static int __init gre_init(void) 119static int __init gre_init(void)
122{ 120{
123 pr_info("GRE over IPv4 demultiplexor driver\n"); 121 pr_info("GRE over IPv4 demultiplexor driver");
124 122
125 if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { 123 if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
126 pr_err("can't add protocol\n"); 124 pr_err("gre: can't add protocol\n");
127 return -EAGAIN; 125 return -EAGAIN;
128 } 126 }
129 127
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 17ff9fd7cdd..23ef31baa1a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -62,8 +62,6 @@
62 * 62 *
63 */ 63 */
64 64
65#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
66
67#include <linux/module.h> 65#include <linux/module.h>
68#include <linux/types.h> 66#include <linux/types.h>
69#include <linux/jiffies.h> 67#include <linux/jiffies.h>
@@ -91,11 +89,11 @@
91#include <linux/errno.h> 89#include <linux/errno.h>
92#include <linux/timer.h> 90#include <linux/timer.h>
93#include <linux/init.h> 91#include <linux/init.h>
92#include <asm/system.h>
94#include <asm/uaccess.h> 93#include <asm/uaccess.h>
95#include <net/checksum.h> 94#include <net/checksum.h>
96#include <net/xfrm.h> 95#include <net/xfrm.h>
97#include <net/inet_common.h> 96#include <net/inet_common.h>
98#include <net/ip_fib.h>
99 97
100/* 98/*
101 * Build xmit assembly blocks 99 * Build xmit assembly blocks
@@ -254,11 +252,10 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
254 252
255 /* Limit if icmp type is enabled in ratemask. */ 253 /* Limit if icmp type is enabled in ratemask. */
256 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { 254 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
257 struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); 255 if (!rt->peer)
258 rc = inet_peer_xrlim_allow(peer, 256 rt_bind_peer(rt, fl4->daddr, 1);
257 rc = inet_peer_xrlim_allow(rt->peer,
259 net->ipv4.sysctl_icmp_ratelimit); 258 net->ipv4.sysctl_icmp_ratelimit);
260 if (peer)
261 inet_putpeer(peer);
262 } 259 }
263out: 260out:
264 return rc; 261 return rc;
@@ -336,7 +333,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
336 struct flowi4 fl4; 333 struct flowi4 fl4;
337 struct sock *sk; 334 struct sock *sk;
338 struct inet_sock *inet; 335 struct inet_sock *inet;
339 __be32 daddr, saddr; 336 __be32 daddr;
340 337
341 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) 338 if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
342 return; 339 return;
@@ -350,7 +347,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
350 347
351 inet->tos = ip_hdr(skb)->tos; 348 inet->tos = ip_hdr(skb)->tos;
352 daddr = ipc.addr = ip_hdr(skb)->saddr; 349 daddr = ipc.addr = ip_hdr(skb)->saddr;
353 saddr = fib_compute_spec_dst(skb);
354 ipc.opt = NULL; 350 ipc.opt = NULL;
355 ipc.tx_flags = 0; 351 ipc.tx_flags = 0;
356 if (icmp_param->replyopts.opt.opt.optlen) { 352 if (icmp_param->replyopts.opt.opt.optlen) {
@@ -360,7 +356,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
360 } 356 }
361 memset(&fl4, 0, sizeof(fl4)); 357 memset(&fl4, 0, sizeof(fl4));
362 fl4.daddr = daddr; 358 fl4.daddr = daddr;
363 fl4.saddr = saddr; 359 fl4.saddr = rt->rt_spec_dst;
364 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); 360 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
365 fl4.flowi4_proto = IPPROTO_ICMP; 361 fl4.flowi4_proto = IPPROTO_ICMP;
366 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 362 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
@@ -572,7 +568,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
572 rcu_read_lock(); 568 rcu_read_lock();
573 if (rt_is_input_route(rt) && 569 if (rt_is_input_route(rt) &&
574 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) 570 net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
575 dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); 571 dev = dev_get_by_index_rcu(net, rt->rt_iif);
576 572
577 if (dev) 573 if (dev)
578 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); 574 saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
@@ -635,27 +631,6 @@ out:;
635EXPORT_SYMBOL(icmp_send); 631EXPORT_SYMBOL(icmp_send);
636 632
637 633
638static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
639{
640 const struct iphdr *iph = (const struct iphdr *) skb->data;
641 const struct net_protocol *ipprot;
642 int protocol = iph->protocol;
643
644 /* Checkin full IP header plus 8 bytes of protocol to
645 * avoid additional coding at protocol handlers.
646 */
647 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
648 return;
649
650 raw_icmp_error(skb, protocol, info);
651
652 rcu_read_lock();
653 ipprot = rcu_dereference(inet_protos[protocol]);
654 if (ipprot && ipprot->err_handler)
655 ipprot->err_handler(skb, info);
656 rcu_read_unlock();
657}
658
659/* 634/*
660 * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. 635 * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
661 */ 636 */
@@ -664,8 +639,10 @@ static void icmp_unreach(struct sk_buff *skb)
664{ 639{
665 const struct iphdr *iph; 640 const struct iphdr *iph;
666 struct icmphdr *icmph; 641 struct icmphdr *icmph;
667 struct net *net; 642 int hash, protocol;
643 const struct net_protocol *ipprot;
668 u32 info = 0; 644 u32 info = 0;
645 struct net *net;
669 646
670 net = dev_net(skb_dst(skb)->dev); 647 net = dev_net(skb_dst(skb)->dev);
671 648
@@ -693,16 +670,18 @@ static void icmp_unreach(struct sk_buff *skb)
693 break; 670 break;
694 case ICMP_FRAG_NEEDED: 671 case ICMP_FRAG_NEEDED:
695 if (ipv4_config.no_pmtu_disc) { 672 if (ipv4_config.no_pmtu_disc) {
696 LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), 673 LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: fragmentation needed and DF set.\n",
697 &iph->daddr); 674 &iph->daddr);
698 } else { 675 } else {
699 info = ntohs(icmph->un.frag.mtu); 676 info = ip_rt_frag_needed(net, iph,
677 ntohs(icmph->un.frag.mtu),
678 skb->dev);
700 if (!info) 679 if (!info)
701 goto out; 680 goto out;
702 } 681 }
703 break; 682 break;
704 case ICMP_SR_FAILED: 683 case ICMP_SR_FAILED:
705 LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"), 684 LIMIT_NETDEBUG(KERN_INFO "ICMP: %pI4: Source Route Failed.\n",
706 &iph->daddr); 685 &iph->daddr);
707 break; 686 break;
708 default: 687 default:
@@ -733,14 +712,37 @@ static void icmp_unreach(struct sk_buff *skb)
733 712
734 if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && 713 if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
735 inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { 714 inet_addr_type(net, iph->daddr) == RTN_BROADCAST) {
736 net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", 715 if (net_ratelimit())
737 &ip_hdr(skb)->saddr, 716 printk(KERN_WARNING "%pI4 sent an invalid ICMP "
738 icmph->type, icmph->code, 717 "type %u, code %u "
739 &iph->daddr, skb->dev->name); 718 "error to a broadcast: %pI4 on %s\n",
719 &ip_hdr(skb)->saddr,
720 icmph->type, icmph->code,
721 &iph->daddr,
722 skb->dev->name);
740 goto out; 723 goto out;
741 } 724 }
742 725
743 icmp_socket_deliver(skb, info); 726 /* Checkin full IP header plus 8 bytes of protocol to
727 * avoid additional coding at protocol handlers.
728 */
729 if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
730 goto out;
731
732 iph = (const struct iphdr *)skb->data;
733 protocol = iph->protocol;
734
735 /*
736 * Deliver ICMP message to raw sockets. Pretty useless feature?
737 */
738 raw_icmp_error(skb, protocol, info);
739
740 hash = protocol & (MAX_INET_PROTOS - 1);
741 rcu_read_lock();
742 ipprot = rcu_dereference(inet_protos[hash]);
743 if (ipprot && ipprot->err_handler)
744 ipprot->err_handler(skb, info);
745 rcu_read_unlock();
744 746
745out: 747out:
746 return; 748 return;
@@ -756,15 +758,46 @@ out_err:
756 758
757static void icmp_redirect(struct sk_buff *skb) 759static void icmp_redirect(struct sk_buff *skb)
758{ 760{
759 if (skb->len < sizeof(struct iphdr)) { 761 const struct iphdr *iph;
760 ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); 762
761 return; 763 if (skb->len < sizeof(struct iphdr))
762 } 764 goto out_err;
763 765
766 /*
767 * Get the copied header of the packet that caused the redirect
768 */
764 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 769 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
765 return; 770 goto out;
771
772 iph = (const struct iphdr *)skb->data;
766 773
767 icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); 774 switch (icmp_hdr(skb)->code & 7) {
775 case ICMP_REDIR_NET:
776 case ICMP_REDIR_NETTOS:
777 /*
778 * As per RFC recommendations now handle it as a host redirect.
779 */
780 case ICMP_REDIR_HOST:
781 case ICMP_REDIR_HOSTTOS:
782 ip_rt_redirect(ip_hdr(skb)->saddr, iph->daddr,
783 icmp_hdr(skb)->un.gateway,
784 iph->saddr, skb->dev);
785 break;
786 }
787
788 /* Ping wants to see redirects.
789 * Let's pretend they are errors of sorts... */
790 if (iph->protocol == IPPROTO_ICMP &&
791 iph->ihl >= 5 &&
792 pskb_may_pull(skb, (iph->ihl<<2)+8)) {
793 ping_err(skb, icmp_hdr(skb)->un.gateway);
794 }
795
796out:
797 return;
798out_err:
799 ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
800 goto out;
768} 801}
769 802
770/* 803/*
@@ -838,6 +871,87 @@ out_err:
838 goto out; 871 goto out;
839} 872}
840 873
874
875/*
876 * Handle ICMP_ADDRESS_MASK requests. (RFC950)
877 *
878 * RFC1122 (3.2.2.9). A host MUST only send replies to
879 * ADDRESS_MASK requests if it's been configured as an address mask
880 * agent. Receiving a request doesn't constitute implicit permission to
881 * act as one. Of course, implementing this correctly requires (SHOULD)
882 * a way to turn the functionality on and off. Another one for sysctl(),
883 * I guess. -- MS
884 *
885 * RFC1812 (4.3.3.9). A router MUST implement it.
886 * A router SHOULD have switch turning it on/off.
887 * This switch MUST be ON by default.
888 *
889 * Gratuitous replies, zero-source replies are not implemented,
890 * that complies with RFC. DO NOT implement them!!! All the idea
891 * of broadcast addrmask replies as specified in RFC950 is broken.
892 * The problem is that it is not uncommon to have several prefixes
893 * on one physical interface. Moreover, addrmask agent can even be
894 * not aware of existing another prefixes.
895 * If source is zero, addrmask agent cannot choose correct prefix.
896 * Gratuitous mask announcements suffer from the same problem.
897 * RFC1812 explains it, but still allows to use ADDRMASK,
898 * that is pretty silly. --ANK
899 *
900 * All these rules are so bizarre, that I removed kernel addrmask
901 * support at all. It is wrong, it is obsolete, nobody uses it in
902 * any case. --ANK
903 *
904 * Furthermore you can do it with a usermode address agent program
905 * anyway...
906 */
907
908static void icmp_address(struct sk_buff *skb)
909{
910#if 0
911 if (net_ratelimit())
912 printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
913#endif
914}
915
916/*
917 * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
918 * loudly if an inconsistency is found.
919 * called with rcu_read_lock()
920 */
921
922static void icmp_address_reply(struct sk_buff *skb)
923{
924 struct rtable *rt = skb_rtable(skb);
925 struct net_device *dev = skb->dev;
926 struct in_device *in_dev;
927 struct in_ifaddr *ifa;
928
929 if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
930 return;
931
932 in_dev = __in_dev_get_rcu(dev);
933 if (!in_dev)
934 return;
935
936 if (in_dev->ifa_list &&
937 IN_DEV_LOG_MARTIANS(in_dev) &&
938 IN_DEV_FORWARD(in_dev)) {
939 __be32 _mask, *mp;
940
941 mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
942 BUG_ON(mp == NULL);
943 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
944 if (*mp == ifa->ifa_mask &&
945 inet_ifa_match(ip_hdr(skb)->saddr, ifa))
946 break;
947 }
948 if (!ifa && net_ratelimit()) {
949 printk(KERN_INFO "Wrong address mask %pI4 from %s/%pI4\n",
950 mp, dev->name, &ip_hdr(skb)->saddr);
951 }
952 }
953}
954
841static void icmp_discard(struct sk_buff *skb) 955static void icmp_discard(struct sk_buff *skb)
842{ 956{
843} 957}
@@ -1001,10 +1115,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
1001 .handler = icmp_discard, 1115 .handler = icmp_discard,
1002 }, 1116 },
1003 [ICMP_ADDRESS] = { 1117 [ICMP_ADDRESS] = {
1004 .handler = icmp_discard, 1118 .handler = icmp_address,
1005 }, 1119 },
1006 [ICMP_ADDRESSREPLY] = { 1120 [ICMP_ADDRESSREPLY] = {
1007 .handler = icmp_discard, 1121 .handler = icmp_address_reply,
1008 }, 1122 },
1009}; 1123};
1010 1124
@@ -1038,9 +1152,10 @@ static int __net_init icmp_sk_init(struct net *net)
1038 net->ipv4.icmp_sk[i] = sk; 1152 net->ipv4.icmp_sk[i] = sk;
1039 1153
1040 /* Enough space for 2 64K ICMP packets, including 1154 /* Enough space for 2 64K ICMP packets, including
1041 * sk_buff/skb_shared_info struct overhead. 1155 * sk_buff struct overhead.
1042 */ 1156 */
1043 sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); 1157 sk->sk_sndbuf =
1158 (2 * ((64 * 1024) + sizeof(struct sk_buff)));
1044 1159
1045 /* 1160 /*
1046 * Speedup sock_wfree() 1161 * Speedup sock_wfree()
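[Annotation] The icmp_sk_init() hunk replaces a hand-rolled send-buffer estimate with SKB_TRUESIZE(). Paraphrased from include/linux/skbuff.h (an assumption about this kernel generation, not a quote), the macro charges the aligned overhead of both struct sk_buff and the trailing struct skb_shared_info, which the open-coded "+ sizeof(struct sk_buff)" under-counted:

        /* paraphrased definition: */
        #define SKB_TRUESIZE(X) ((X) +                                          \
                                 SKB_DATA_ALIGN(sizeof(struct sk_buff)) +       \
                                 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

        /* enough space for two 64K ICMP packets, including overhead: */
        sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);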
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 736ab70fd17..e0d42dbb33f 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -73,6 +73,7 @@
73#include <linux/module.h> 73#include <linux/module.h>
74#include <linux/slab.h> 74#include <linux/slab.h>
75#include <asm/uaccess.h> 75#include <asm/uaccess.h>
76#include <asm/system.h>
76#include <linux/types.h> 77#include <linux/types.h>
77#include <linux/kernel.h> 78#include <linux/kernel.h>
78#include <linux/jiffies.h> 79#include <linux/jiffies.h>
@@ -303,11 +304,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
303 struct igmpv3_report *pig; 304 struct igmpv3_report *pig;
304 struct net *net = dev_net(dev); 305 struct net *net = dev_net(dev);
305 struct flowi4 fl4; 306 struct flowi4 fl4;
306 int hlen = LL_RESERVED_SPACE(dev);
307 int tlen = dev->needed_tailroom;
308 307
309 while (1) { 308 while (1) {
310 skb = alloc_skb(size + hlen + tlen, 309 skb = alloc_skb(size + LL_ALLOCATED_SPACE(dev),
311 GFP_ATOMIC | __GFP_NOWARN); 310 GFP_ATOMIC | __GFP_NOWARN);
312 if (skb) 311 if (skb)
313 break; 312 break;
@@ -328,7 +327,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
328 skb_dst_set(skb, &rt->dst); 327 skb_dst_set(skb, &rt->dst);
329 skb->dev = dev; 328 skb->dev = dev;
330 329
331 skb_reserve(skb, hlen); 330 skb_reserve(skb, LL_RESERVED_SPACE(dev));
332 331
333 skb_reset_network_header(skb); 332 skb_reset_network_header(skb);
334 pip = ip_hdr(skb); 333 pip = ip_hdr(skb);
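[Annotation] The two igmpv3_newpack() hunks above differ only in how the link-layer headroom and tailroom are sized (LL_RESERVED_SPACE() plus needed_tailroom in the left column versus the older combined helper in the right); the allocation pattern itself is unchanged. A condensed sketch of that pattern as it appears in the left column:

        int hlen = LL_RESERVED_SPACE(dev);      /* headroom for the link-layer header */
        int tlen = dev->needed_tailroom;        /* tailroom some devices require */
        struct sk_buff *skb;

        skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN);
        if (!skb)
                return NULL;
        skb_reserve(skb, hlen);                 /* leave the headroom; data starts after it */
        skb_reset_network_header(skb);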
@@ -344,10 +343,10 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
344 pip->protocol = IPPROTO_IGMP; 343 pip->protocol = IPPROTO_IGMP;
345 pip->tot_len = 0; /* filled in later */ 344 pip->tot_len = 0; /* filled in later */
346 ip_select_ident(pip, &rt->dst, NULL); 345 ip_select_ident(pip, &rt->dst, NULL);
347 ((u8 *)&pip[1])[0] = IPOPT_RA; 346 ((u8*)&pip[1])[0] = IPOPT_RA;
348 ((u8 *)&pip[1])[1] = 4; 347 ((u8*)&pip[1])[1] = 4;
349 ((u8 *)&pip[1])[2] = 0; 348 ((u8*)&pip[1])[2] = 0;
350 ((u8 *)&pip[1])[3] = 0; 349 ((u8*)&pip[1])[3] = 0;
351 350
352 skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; 351 skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
353 skb_put(skb, sizeof(*pig)); 352 skb_put(skb, sizeof(*pig));
@@ -648,7 +647,6 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
648 __be32 group = pmc ? pmc->multiaddr : 0; 647 __be32 group = pmc ? pmc->multiaddr : 0;
649 struct flowi4 fl4; 648 struct flowi4 fl4;
650 __be32 dst; 649 __be32 dst;
651 int hlen, tlen;
652 650
653 if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) 651 if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
654 return igmpv3_send_report(in_dev, pmc); 652 return igmpv3_send_report(in_dev, pmc);
@@ -663,9 +661,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
663 if (IS_ERR(rt)) 661 if (IS_ERR(rt))
664 return -1; 662 return -1;
665 663
666 hlen = LL_RESERVED_SPACE(dev); 664 skb = alloc_skb(IGMP_SIZE+LL_ALLOCATED_SPACE(dev), GFP_ATOMIC);
667 tlen = dev->needed_tailroom;
668 skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
669 if (skb == NULL) { 665 if (skb == NULL) {
670 ip_rt_put(rt); 666 ip_rt_put(rt);
671 return -1; 667 return -1;
@@ -673,7 +669,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
673 669
674 skb_dst_set(skb, &rt->dst); 670 skb_dst_set(skb, &rt->dst);
675 671
676 skb_reserve(skb, hlen); 672 skb_reserve(skb, LL_RESERVED_SPACE(dev));
677 673
678 skb_reset_network_header(skb); 674 skb_reset_network_header(skb);
679 iph = ip_hdr(skb); 675 iph = ip_hdr(skb);
@@ -688,10 +684,10 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
688 iph->saddr = fl4.saddr; 684 iph->saddr = fl4.saddr;
689 iph->protocol = IPPROTO_IGMP; 685 iph->protocol = IPPROTO_IGMP;
690 ip_select_ident(iph, &rt->dst, NULL); 686 ip_select_ident(iph, &rt->dst, NULL);
691 ((u8 *)&iph[1])[0] = IPOPT_RA; 687 ((u8*)&iph[1])[0] = IPOPT_RA;
692 ((u8 *)&iph[1])[1] = 4; 688 ((u8*)&iph[1])[1] = 4;
693 ((u8 *)&iph[1])[2] = 0; 689 ((u8*)&iph[1])[2] = 0;
694 ((u8 *)&iph[1])[3] = 0; 690 ((u8*)&iph[1])[3] = 0;
695 691
696 ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); 692 ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
697 ih->type = type; 693 ih->type = type;
@@ -774,7 +770,7 @@ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
774 if (psf->sf_count[MCAST_INCLUDE] || 770 if (psf->sf_count[MCAST_INCLUDE] ||
775 pmc->sfcount[MCAST_EXCLUDE] != 771 pmc->sfcount[MCAST_EXCLUDE] !=
776 psf->sf_count[MCAST_EXCLUDE]) 772 psf->sf_count[MCAST_EXCLUDE])
777 break; 773 continue;
778 if (srcs[i] == psf->sf_inaddr) { 774 if (srcs[i] == psf->sf_inaddr) {
779 scount++; 775 scount++;
780 break; 776 break;
@@ -815,15 +811,14 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
815 return 1; 811 return 1;
816} 812}
817 813
818/* return true if packet was dropped */ 814static void igmp_heard_report(struct in_device *in_dev, __be32 group)
819static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
820{ 815{
821 struct ip_mc_list *im; 816 struct ip_mc_list *im;
822 817
823 /* Timers are only set for non-local groups */ 818 /* Timers are only set for non-local groups */
824 819
825 if (group == IGMP_ALL_HOSTS) 820 if (group == IGMP_ALL_HOSTS)
826 return false; 821 return;
827 822
828 rcu_read_lock(); 823 rcu_read_lock();
829 for_each_pmc_rcu(in_dev, im) { 824 for_each_pmc_rcu(in_dev, im) {
@@ -833,11 +828,9 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
833 } 828 }
834 } 829 }
835 rcu_read_unlock(); 830 rcu_read_unlock();
836 return false;
837} 831}
838 832
839/* return true if packet was dropped */ 833static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
840static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
841 int len) 834 int len)
842{ 835{
843 struct igmphdr *ih = igmp_hdr(skb); 836 struct igmphdr *ih = igmp_hdr(skb);
@@ -869,7 +862,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
869 /* clear deleted report items */ 862 /* clear deleted report items */
870 igmpv3_clear_delrec(in_dev); 863 igmpv3_clear_delrec(in_dev);
871 } else if (len < 12) { 864 } else if (len < 12) {
872 return true; /* ignore bogus packet; freed by caller */ 865 return; /* ignore bogus packet; freed by caller */
873 } else if (IGMP_V1_SEEN(in_dev)) { 866 } else if (IGMP_V1_SEEN(in_dev)) {
874 /* This is a v3 query with v1 queriers present */ 867 /* This is a v3 query with v1 queriers present */
875 max_delay = IGMP_Query_Response_Interval; 868 max_delay = IGMP_Query_Response_Interval;
@@ -886,13 +879,13 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
886 max_delay = 1; /* can't mod w/ 0 */ 879 max_delay = 1; /* can't mod w/ 0 */
887 } else { /* v3 */ 880 } else { /* v3 */
888 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) 881 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
889 return true; 882 return;
890 883
891 ih3 = igmpv3_query_hdr(skb); 884 ih3 = igmpv3_query_hdr(skb);
892 if (ih3->nsrcs) { 885 if (ih3->nsrcs) {
893 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) 886 if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
894 + ntohs(ih3->nsrcs)*sizeof(__be32))) 887 + ntohs(ih3->nsrcs)*sizeof(__be32)))
895 return true; 888 return;
896 ih3 = igmpv3_query_hdr(skb); 889 ih3 = igmpv3_query_hdr(skb);
897 } 890 }
898 891
@@ -904,9 +897,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
904 in_dev->mr_qrv = ih3->qrv; 897 in_dev->mr_qrv = ih3->qrv;
905 if (!group) { /* general query */ 898 if (!group) { /* general query */
906 if (ih3->nsrcs) 899 if (ih3->nsrcs)
907 return false; /* no sources allowed */ 900 return; /* no sources allowed */
908 igmp_gq_start_timer(in_dev); 901 igmp_gq_start_timer(in_dev);
909 return false; 902 return;
910 } 903 }
911 /* mark sources to include, if group & source-specific */ 904 /* mark sources to include, if group & source-specific */
912 mark = ih3->nsrcs != 0; 905 mark = ih3->nsrcs != 0;
@@ -942,7 +935,6 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
942 igmp_mod_timer(im, max_delay); 935 igmp_mod_timer(im, max_delay);
943 } 936 }
944 rcu_read_unlock(); 937 rcu_read_unlock();
945 return false;
946} 938}
947 939
948/* called in rcu_read_lock() section */ 940/* called in rcu_read_lock() section */
@@ -952,7 +944,6 @@ int igmp_rcv(struct sk_buff *skb)
952 struct igmphdr *ih; 944 struct igmphdr *ih;
953 struct in_device *in_dev = __in_dev_get_rcu(skb->dev); 945 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
954 int len = skb->len; 946 int len = skb->len;
955 bool dropped = true;
956 947
957 if (in_dev == NULL) 948 if (in_dev == NULL)
958 goto drop; 949 goto drop;
@@ -974,7 +965,7 @@ int igmp_rcv(struct sk_buff *skb)
974 ih = igmp_hdr(skb); 965 ih = igmp_hdr(skb);
975 switch (ih->type) { 966 switch (ih->type) {
976 case IGMP_HOST_MEMBERSHIP_QUERY: 967 case IGMP_HOST_MEMBERSHIP_QUERY:
977 dropped = igmp_heard_query(in_dev, skb, len); 968 igmp_heard_query(in_dev, skb, len);
978 break; 969 break;
979 case IGMP_HOST_MEMBERSHIP_REPORT: 970 case IGMP_HOST_MEMBERSHIP_REPORT:
980 case IGMPV2_HOST_MEMBERSHIP_REPORT: 971 case IGMPV2_HOST_MEMBERSHIP_REPORT:
@@ -984,7 +975,7 @@ int igmp_rcv(struct sk_buff *skb)
984 /* don't rely on MC router hearing unicast reports */ 975 /* don't rely on MC router hearing unicast reports */
985 if (skb->pkt_type == PACKET_MULTICAST || 976 if (skb->pkt_type == PACKET_MULTICAST ||
986 skb->pkt_type == PACKET_BROADCAST) 977 skb->pkt_type == PACKET_BROADCAST)
987 dropped = igmp_heard_report(in_dev, ih->group); 978 igmp_heard_report(in_dev, ih->group);
988 break; 979 break;
989 case IGMP_PIM: 980 case IGMP_PIM:
990#ifdef CONFIG_IP_PIMSM_V1 981#ifdef CONFIG_IP_PIMSM_V1
@@ -1002,10 +993,7 @@ int igmp_rcv(struct sk_buff *skb)
1002 } 993 }
1003 994
1004drop: 995drop:
1005 if (dropped) 996 kfree_skb(skb);
1006 kfree_skb(skb);
1007 else
1008 consume_skb(skb);
1009 return 0; 997 return 0;
1010} 998}
1011 999
@@ -1023,7 +1011,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
1023 1011
1024 /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. 1012 /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
1025 We will get multicast token leakage, when IFF_MULTICAST 1013 We will get multicast token leakage, when IFF_MULTICAST
1026 is changed. This check should be done in ndo_set_rx_mode 1014 is changed. This check should be done in dev->set_multicast_list
1027 routine. Something sort of: 1015 routine. Something sort of:
1028 if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } 1016 if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
1029 --ANK 1017 --ANK
@@ -1588,7 +1576,7 @@ out_unlock:
1588 * Add multicast single-source filter to the interface list 1576 * Add multicast single-source filter to the interface list
1589 */ 1577 */
1590static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, 1578static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
1591 __be32 *psfsrc) 1579 __be32 *psfsrc, int delta)
1592{ 1580{
1593 struct ip_sf_list *psf, *psf_prev; 1581 struct ip_sf_list *psf, *psf_prev;
1594 1582
@@ -1723,15 +1711,14 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
1723 pmc->sfcount[sfmode]++; 1711 pmc->sfcount[sfmode]++;
1724 err = 0; 1712 err = 0;
1725 for (i=0; i<sfcount; i++) { 1713 for (i=0; i<sfcount; i++) {
1726 err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); 1714 err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
1727 if (err) 1715 if (err)
1728 break; 1716 break;
1729 } 1717 }
1730 if (err) { 1718 if (err) {
1731 int j; 1719 int j;
1732 1720
1733 if (!delta) 1721 pmc->sfcount[sfmode]--;
1734 pmc->sfcount[sfmode]--;
1735 for (j=0; j<i; j++) 1722 for (j=0; j<i; j++)
1736 (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); 1723 (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
1737 } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { 1724 } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
@@ -1850,7 +1837,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
1850 } 1837 }
1851 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, 1838 err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
1852 iml->sfmode, psf->sl_count, psf->sl_addr, 0); 1839 iml->sfmode, psf->sl_count, psf->sl_addr, 0);
1853 RCU_INIT_POINTER(iml->sflist, NULL); 1840 rcu_assign_pointer(iml->sflist, NULL);
1854 /* decrease mem now to avoid the memleak warning */ 1841 /* decrease mem now to avoid the memleak warning */
1855 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); 1842 atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
1856 kfree_rcu(psf, rcu); 1843 kfree_rcu(psf, rcu);
@@ -1904,7 +1891,6 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1904 rtnl_unlock(); 1891 rtnl_unlock();
1905 return ret; 1892 return ret;
1906} 1893}
1907EXPORT_SYMBOL(ip_mc_leave_group);
1908 1894
1909int ip_mc_source(int add, int omode, struct sock *sk, struct 1895int ip_mc_source(int add, int omode, struct sock *sk, struct
1910 ip_mreq_source *mreqs, int ifindex) 1896 ip_mreq_source *mreqs, int ifindex)
@@ -2444,8 +2430,6 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2444 struct ip_mc_list *im = (struct ip_mc_list *)v; 2430 struct ip_mc_list *im = (struct ip_mc_list *)v;
2445 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); 2431 struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
2446 char *querier; 2432 char *querier;
2447 long delta;
2448
2449#ifdef CONFIG_IP_MULTICAST 2433#ifdef CONFIG_IP_MULTICAST
2450 querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : 2434 querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
2451 IGMP_V2_SEEN(state->in_dev) ? "V2" : 2435 IGMP_V2_SEEN(state->in_dev) ? "V2" :
@@ -2459,12 +2443,11 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
2459 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); 2443 state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
2460 } 2444 }
2461 2445
2462 delta = im->timer.expires - jiffies;
2463 seq_printf(seq, 2446 seq_printf(seq,
2464 "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", 2447 "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
2465 im->multiaddr, im->users, 2448 im->multiaddr, im->users,
2466 im->tm_running, 2449 im->tm_running, im->tm_running ?
2467 im->tm_running ? jiffies_delta_to_clock_t(delta) : 0, 2450 jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
2468 im->reporter); 2451 im->reporter);
2469 } 2452 }
2470 return 0; 2453 return 0;
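
The igmp_heard_query() hunks above center on validating a variable-length IGMPv3 query — a fixed header followed by nsrcs 32-bit source addresses — before the source list is ever dereferenced (the pskb_may_pull() calls). A minimal userspace sketch of that length check, assuming a plain byte buffer instead of an skb and a simplified header layout (illustrative only, not the kernel's struct igmpv3_query):

    #include <stdint.h>
    #include <stddef.h>
    #include <arpa/inet.h>   /* ntohs() */

    /* Simplified stand-in for the IGMPv3 query header (assumption, not the kernel layout). */
    struct igmpv3_query_hdr {
        uint8_t  type;
        uint8_t  code;
        uint16_t csum;
        uint32_t group;
        uint8_t  misc;      /* resv/S/QRV */
        uint8_t  qqic;
        uint16_t nsrcs;     /* number of sources, network byte order */
        /* followed by nsrcs * 4 bytes of source addresses */
    };

    /* Return a pointer to the source list only if the buffer really contains the
     * fixed header plus nsrcs 32-bit addresses; otherwise NULL. Mirrors the
     * two-stage pskb_may_pull() checks in the hunks above. */
    static const uint32_t *igmpv3_sources(const uint8_t *buf, size_t len,
                                          uint16_t *nsrcs_out)
    {
        const struct igmpv3_query_hdr *q;
        uint16_t nsrcs;

        if (len < sizeof(*q))
            return NULL;                    /* truncated fixed header */
        q = (const struct igmpv3_query_hdr *)buf;
        nsrcs = ntohs(q->nsrcs);
        if (len < sizeof(*q) + (size_t)nsrcs * sizeof(uint32_t))
            return NULL;                    /* source list truncated */
        *nsrcs_out = nsrcs;
        return (const uint32_t *)(buf + sizeof(*q));
    }

The point carried over from the diff is that nsrcs comes off the wire, so the total length has to be re-checked after the ntohs() conversion; in the kernel version the header pointer is also reloaded after the second pull, since pskb_may_pull() may relocate the packet data.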
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d0670f00d52..c14d88ad348 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -42,8 +42,7 @@ EXPORT_SYMBOL(sysctl_local_reserved_ports);
42 42
43void inet_get_local_port_range(int *low, int *high) 43void inet_get_local_port_range(int *low, int *high)
44{ 44{
45 unsigned int seq; 45 unsigned seq;
46
47 do { 46 do {
48 seq = read_seqbegin(&sysctl_local_ports.lock); 47 seq = read_seqbegin(&sysctl_local_ports.lock);
49 48
@@ -54,7 +53,7 @@ void inet_get_local_port_range(int *low, int *high)
54EXPORT_SYMBOL(inet_get_local_port_range); 53EXPORT_SYMBOL(inet_get_local_port_range);
55 54
56int inet_csk_bind_conflict(const struct sock *sk, 55int inet_csk_bind_conflict(const struct sock *sk,
57 const struct inet_bind_bucket *tb, bool relax) 56 const struct inet_bind_bucket *tb)
58{ 57{
59 struct sock *sk2; 58 struct sock *sk2;
60 struct hlist_node *node; 59 struct hlist_node *node;
@@ -80,14 +79,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
80 sk2_rcv_saddr == sk_rcv_saddr(sk)) 79 sk2_rcv_saddr == sk_rcv_saddr(sk))
81 break; 80 break;
82 } 81 }
83 if (!relax && reuse && sk2->sk_reuse &&
84 sk2->sk_state != TCP_LISTEN) {
85 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
86
87 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
88 sk2_rcv_saddr == sk_rcv_saddr(sk))
89 break;
90 }
91 } 82 }
92 } 83 }
93 return node != NULL; 84 return node != NULL;
@@ -131,16 +122,12 @@ again:
131 (tb->num_owners < smallest_size || smallest_size == -1)) { 122 (tb->num_owners < smallest_size || smallest_size == -1)) {
132 smallest_size = tb->num_owners; 123 smallest_size = tb->num_owners;
133 smallest_rover = rover; 124 smallest_rover = rover;
134 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 && 125 if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
135 !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { 126 spin_unlock(&head->lock);
136 snum = smallest_rover; 127 snum = smallest_rover;
137 goto tb_found; 128 goto have_snum;
138 } 129 }
139 } 130 }
140 if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
141 snum = rover;
142 goto tb_found;
143 }
144 goto next; 131 goto next;
145 } 132 }
146 break; 133 break;
@@ -182,22 +169,18 @@ have_snum:
182 goto tb_not_found; 169 goto tb_not_found;
183tb_found: 170tb_found:
184 if (!hlist_empty(&tb->owners)) { 171 if (!hlist_empty(&tb->owners)) {
185 if (sk->sk_reuse == SK_FORCE_REUSE)
186 goto success;
187
188 if (tb->fastreuse > 0 && 172 if (tb->fastreuse > 0 &&
189 sk->sk_reuse && sk->sk_state != TCP_LISTEN && 173 sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
190 smallest_size == -1) { 174 smallest_size == -1) {
191 goto success; 175 goto success;
192 } else { 176 } else {
193 ret = 1; 177 ret = 1;
194 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { 178 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
195 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && 179 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
196 smallest_size != -1 && --attempts >= 0) { 180 smallest_size != -1 && --attempts >= 0) {
197 spin_unlock(&head->lock); 181 spin_unlock(&head->lock);
198 goto again; 182 goto again;
199 } 183 }
200
201 goto fail_unlock; 184 goto fail_unlock;
202 } 185 }
203 } 186 }
@@ -283,9 +266,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
283struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) 266struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
284{ 267{
285 struct inet_connection_sock *icsk = inet_csk(sk); 268 struct inet_connection_sock *icsk = inet_csk(sk);
286 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
287 struct sock *newsk; 269 struct sock *newsk;
288 struct request_sock *req;
289 int error; 270 int error;
290 271
291 lock_sock(sk); 272 lock_sock(sk);
@@ -298,7 +279,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
298 goto out_err; 279 goto out_err;
299 280
300 /* Find already established connection */ 281 /* Find already established connection */
301 if (reqsk_queue_empty(queue)) { 282 if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
302 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); 283 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
303 284
304 /* If this is a non blocking socket don't sleep */ 285 /* If this is a non blocking socket don't sleep */
@@ -310,32 +291,14 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
310 if (error) 291 if (error)
311 goto out_err; 292 goto out_err;
312 } 293 }
313 req = reqsk_queue_remove(queue); 294
314 newsk = req->sk; 295 newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
315 296 WARN_ON(newsk->sk_state == TCP_SYN_RECV);
316 sk_acceptq_removed(sk);
317 if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
318 spin_lock_bh(&queue->fastopenq->lock);
319 if (tcp_rsk(req)->listener) {
320 /* We are still waiting for the final ACK from 3WHS
321 * so can't free req now. Instead, we set req->sk to
322 * NULL to signify that the child socket is taken
323 * so reqsk_fastopen_remove() will free the req
324 * when 3WHS finishes (or is aborted).
325 */
326 req->sk = NULL;
327 req = NULL;
328 }
329 spin_unlock_bh(&queue->fastopenq->lock);
330 }
331out: 297out:
332 release_sock(sk); 298 release_sock(sk);
333 if (req)
334 __reqsk_free(req);
335 return newsk; 299 return newsk;
336out_err: 300out_err:
337 newsk = NULL; 301 newsk = NULL;
338 req = NULL;
339 *err = error; 302 *err = error;
340 goto out; 303 goto out;
341} 304}
@@ -394,19 +357,17 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
394 const struct inet_request_sock *ireq = inet_rsk(req); 357 const struct inet_request_sock *ireq = inet_rsk(req);
395 struct ip_options_rcu *opt = inet_rsk(req)->opt; 358 struct ip_options_rcu *opt = inet_rsk(req)->opt;
396 struct net *net = sock_net(sk); 359 struct net *net = sock_net(sk);
397 int flags = inet_sk_flowi_flags(sk);
398 360
399 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 361 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
400 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 362 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
401 sk->sk_protocol, 363 sk->sk_protocol, inet_sk_flowi_flags(sk),
402 flags,
403 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr, 364 (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
404 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); 365 ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport);
405 security_req_classify_flow(req, flowi4_to_flowi(fl4)); 366 security_req_classify_flow(req, flowi4_to_flowi(fl4));
406 rt = ip_route_output_flow(net, fl4, sk); 367 rt = ip_route_output_flow(net, fl4, sk);
407 if (IS_ERR(rt)) 368 if (IS_ERR(rt))
408 goto no_route; 369 goto no_route;
409 if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) 370 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
410 goto route_err; 371 goto route_err;
411 return &rt->dst; 372 return &rt->dst;
412 373
@@ -424,15 +385,12 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
424{ 385{
425 const struct inet_request_sock *ireq = inet_rsk(req); 386 const struct inet_request_sock *ireq = inet_rsk(req);
426 struct inet_sock *newinet = inet_sk(newsk); 387 struct inet_sock *newinet = inet_sk(newsk);
427 struct ip_options_rcu *opt; 388 struct ip_options_rcu *opt = ireq->opt;
428 struct net *net = sock_net(sk); 389 struct net *net = sock_net(sk);
429 struct flowi4 *fl4; 390 struct flowi4 *fl4;
430 struct rtable *rt; 391 struct rtable *rt;
431 392
432 fl4 = &newinet->cork.fl.u.ip4; 393 fl4 = &newinet->cork.fl.u.ip4;
433
434 rcu_read_lock();
435 opt = rcu_dereference(newinet->inet_opt);
436 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 394 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
437 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 395 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
438 sk->sk_protocol, inet_sk_flowi_flags(sk), 396 sk->sk_protocol, inet_sk_flowi_flags(sk),
@@ -442,15 +400,13 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
442 rt = ip_route_output_flow(net, fl4, sk); 400 rt = ip_route_output_flow(net, fl4, sk);
443 if (IS_ERR(rt)) 401 if (IS_ERR(rt))
444 goto no_route; 402 goto no_route;
445 if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) 403 if (opt && opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
446 goto route_err; 404 goto route_err;
447 rcu_read_unlock();
448 return &rt->dst; 405 return &rt->dst;
449 406
450route_err: 407route_err:
451 ip_rt_put(rt); 408 ip_rt_put(rt);
452no_route: 409no_route:
453 rcu_read_unlock();
454 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 410 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
455 return NULL; 411 return NULL;
456} 412}
@@ -462,7 +418,7 @@ static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
462 return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); 418 return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
463} 419}
464 420
465#if IS_ENABLED(CONFIG_IPV6) 421#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
466#define AF_INET_FAMILY(fam) ((fam) == AF_INET) 422#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
467#else 423#else
468#define AF_INET_FAMILY(fam) 1 424#define AF_INET_FAMILY(fam) 1
@@ -521,31 +477,21 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
521 int *expire, int *resend) 477 int *expire, int *resend)
522{ 478{
523 if (!rskq_defer_accept) { 479 if (!rskq_defer_accept) {
524 *expire = req->num_timeout >= thresh; 480 *expire = req->retrans >= thresh;
525 *resend = 1; 481 *resend = 1;
526 return; 482 return;
527 } 483 }
528 *expire = req->num_timeout >= thresh && 484 *expire = req->retrans >= thresh &&
529 (!inet_rsk(req)->acked || req->num_timeout >= max_retries); 485 (!inet_rsk(req)->acked || req->retrans >= max_retries);
530 /* 486 /*
531 * Do not resend while waiting for data after ACK, 487 * Do not resend while waiting for data after ACK,
532 * start to resend on end of deferring period to give 488 * start to resend on end of deferring period to give
533 * last chance for data or ACK to create established socket. 489 * last chance for data or ACK to create established socket.
534 */ 490 */
535 *resend = !inet_rsk(req)->acked || 491 *resend = !inet_rsk(req)->acked ||
536 req->num_timeout >= rskq_defer_accept - 1; 492 req->retrans >= rskq_defer_accept - 1;
537} 493}
538 494
539int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
540{
541 int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL);
542
543 if (!err)
544 req->num_retrans++;
545 return err;
546}
547EXPORT_SYMBOL(inet_rtx_syn_ack);
548
549void inet_csk_reqsk_queue_prune(struct sock *parent, 495void inet_csk_reqsk_queue_prune(struct sock *parent,
550 const unsigned long interval, 496 const unsigned long interval,
551 const unsigned long timeout, 497 const unsigned long timeout,
@@ -565,7 +511,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
565 511
566 /* Normally all the openreqs are young and become mature 512 /* Normally all the openreqs are young and become mature
567 * (i.e. converted to established socket) for first timeout. 513 * (i.e. converted to established socket) for first timeout.
568 * If synack was not acknowledged for 1 second, it means 514 * If synack was not acknowledged for 3 seconds, it means
569 * one of the following things: synack was lost, ack was lost, 515 * one of the following things: synack was lost, ack was lost,
570 * rtt is high or nobody planned to ack (i.e. synflood). 516 * rtt is high or nobody planned to ack (i.e. synflood).
571 * When server is a bit loaded, queue is populated with old 517 * When server is a bit loaded, queue is populated with old
@@ -606,17 +552,17 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
606 syn_ack_recalc(req, thresh, max_retries, 552 syn_ack_recalc(req, thresh, max_retries,
607 queue->rskq_defer_accept, 553 queue->rskq_defer_accept,
608 &expire, &resend); 554 &expire, &resend);
609 req->rsk_ops->syn_ack_timeout(parent, req); 555 if (req->rsk_ops->syn_ack_timeout)
556 req->rsk_ops->syn_ack_timeout(parent, req);
610 if (!expire && 557 if (!expire &&
611 (!resend || 558 (!resend ||
612 !inet_rtx_syn_ack(parent, req) || 559 !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
613 inet_rsk(req)->acked)) { 560 inet_rsk(req)->acked)) {
614 unsigned long timeo; 561 unsigned long timeo;
615 562
616 if (req->num_timeout++ == 0) 563 if (req->retrans++ == 0)
617 lopt->qlen_young--; 564 lopt->qlen_young--;
618 timeo = min(timeout << req->num_timeout, 565 timeo = min((timeout << req->retrans), max_rto);
619 max_rto);
620 req->expires = now + timeo; 566 req->expires = now + timeo;
621 reqp = &req->dl_next; 567 reqp = &req->dl_next;
622 continue; 568 continue;
@@ -642,19 +588,10 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
642} 588}
643EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); 589EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
644 590
645/** 591struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
646 * inet_csk_clone_lock - clone an inet socket, and lock its clone 592 const gfp_t priority)
647 * @sk: the socket to clone
648 * @req: request_sock
649 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
650 *
651 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
652 */
653struct sock *inet_csk_clone_lock(const struct sock *sk,
654 const struct request_sock *req,
655 const gfp_t priority)
656{ 593{
657 struct sock *newsk = sk_clone_lock(sk, priority); 594 struct sock *newsk = sk_clone(sk, priority);
658 595
659 if (newsk != NULL) { 596 if (newsk != NULL) {
660 struct inet_connection_sock *newicsk = inet_csk(newsk); 597 struct inet_connection_sock *newicsk = inet_csk(newsk);
@@ -678,7 +615,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
678 } 615 }
679 return newsk; 616 return newsk;
680} 617}
681EXPORT_SYMBOL_GPL(inet_csk_clone_lock); 618EXPORT_SYMBOL_GPL(inet_csk_clone);
682 619
683/* 620/*
684 * At this point, there should be no process reference to this 621 * At this point, there should be no process reference to this
@@ -710,22 +647,6 @@ void inet_csk_destroy_sock(struct sock *sk)
710} 647}
711EXPORT_SYMBOL(inet_csk_destroy_sock); 648EXPORT_SYMBOL(inet_csk_destroy_sock);
712 649
713/* This function allows to force a closure of a socket after the call to
714 * tcp/dccp_create_openreq_child().
715 */
716void inet_csk_prepare_forced_close(struct sock *sk)
717{
718 /* sk_clone_lock locked the socket and set refcnt to 2 */
719 bh_unlock_sock(sk);
720 sock_put(sk);
721
722 /* The below has to be done to allow calling inet_csk_destroy_sock */
723 sock_set_flag(sk, SOCK_DEAD);
724 percpu_counter_inc(sk->sk_prot->orphan_count);
725 inet_sk(sk)->inet_num = 0;
726}
727EXPORT_SYMBOL(inet_csk_prepare_forced_close);
728
729int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) 650int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
730{ 651{
731 struct inet_sock *inet = inet_sk(sk); 652 struct inet_sock *inet = inet_sk(sk);
@@ -767,14 +688,13 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
767void inet_csk_listen_stop(struct sock *sk) 688void inet_csk_listen_stop(struct sock *sk)
768{ 689{
769 struct inet_connection_sock *icsk = inet_csk(sk); 690 struct inet_connection_sock *icsk = inet_csk(sk);
770 struct request_sock_queue *queue = &icsk->icsk_accept_queue;
771 struct request_sock *acc_req; 691 struct request_sock *acc_req;
772 struct request_sock *req; 692 struct request_sock *req;
773 693
774 inet_csk_delete_keepalive_timer(sk); 694 inet_csk_delete_keepalive_timer(sk);
775 695
776 /* make all the listen_opt local to us */ 696 /* make all the listen_opt local to us */
777 acc_req = reqsk_queue_yank_acceptq(queue); 697 acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue);
778 698
779 /* Following specs, it would be better either to send FIN 699 /* Following specs, it would be better either to send FIN
780 * (and enter FIN-WAIT-1, it is normal close) 700 * (and enter FIN-WAIT-1, it is normal close)
@@ -784,7 +704,7 @@ void inet_csk_listen_stop(struct sock *sk)
784 * To be honest, we are not able to make either 704 * To be honest, we are not able to make either
785 * of the variants now. --ANK 705 * of the variants now. --ANK
786 */ 706 */
787 reqsk_queue_destroy(queue); 707 reqsk_queue_destroy(&icsk->icsk_accept_queue);
788 708
789 while ((req = acc_req) != NULL) { 709 while ((req = acc_req) != NULL) {
790 struct sock *child = req->sk; 710 struct sock *child = req->sk;
@@ -802,19 +722,6 @@ void inet_csk_listen_stop(struct sock *sk)
802 722
803 percpu_counter_inc(sk->sk_prot->orphan_count); 723 percpu_counter_inc(sk->sk_prot->orphan_count);
804 724
805 if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
806 BUG_ON(tcp_sk(child)->fastopen_rsk != req);
807 BUG_ON(sk != tcp_rsk(req)->listener);
808
809 /* Paranoid, to prevent race condition if
810 * an inbound pkt destined for child is
811 * blocked by sock lock in tcp_v4_rcv().
812 * Also to satisfy an assertion in
813 * tcp_v4_destroy_sock().
814 */
815 tcp_sk(child)->fastopen_rsk = NULL;
816 sock_put(sk);
817 }
818 inet_csk_destroy_sock(child); 725 inet_csk_destroy_sock(child);
819 726
820 bh_unlock_sock(child); 727 bh_unlock_sock(child);
@@ -824,17 +731,6 @@ void inet_csk_listen_stop(struct sock *sk)
824 sk_acceptq_removed(sk); 731 sk_acceptq_removed(sk);
825 __reqsk_free(req); 732 __reqsk_free(req);
826 } 733 }
827 if (queue->fastopenq != NULL) {
828 /* Free all the reqs queued in rskq_rst_head. */
829 spin_lock_bh(&queue->fastopenq->lock);
830 acc_req = queue->fastopenq->rskq_rst_head;
831 queue->fastopenq->rskq_rst_head = NULL;
832 spin_unlock_bh(&queue->fastopenq->lock);
833 while ((req = acc_req) != NULL) {
834 acc_req = req->dl_next;
835 __reqsk_free(req);
836 }
837 }
838 WARN_ON(sk->sk_ack_backlog); 734 WARN_ON(sk->sk_ack_backlog);
839} 735}
840EXPORT_SYMBOL_GPL(inet_csk_listen_stop); 736EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
@@ -877,49 +773,3 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
877} 773}
878EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt); 774EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
879#endif 775#endif
880
881static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
882{
883 const struct inet_sock *inet = inet_sk(sk);
884 const struct ip_options_rcu *inet_opt;
885 __be32 daddr = inet->inet_daddr;
886 struct flowi4 *fl4;
887 struct rtable *rt;
888
889 rcu_read_lock();
890 inet_opt = rcu_dereference(inet->inet_opt);
891 if (inet_opt && inet_opt->opt.srr)
892 daddr = inet_opt->opt.faddr;
893 fl4 = &fl->u.ip4;
894 rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
895 inet->inet_saddr, inet->inet_dport,
896 inet->inet_sport, sk->sk_protocol,
897 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
898 if (IS_ERR(rt))
899 rt = NULL;
900 if (rt)
901 sk_setup_caps(sk, &rt->dst);
902 rcu_read_unlock();
903
904 return &rt->dst;
905}
906
907struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
908{
909 struct dst_entry *dst = __sk_dst_check(sk, 0);
910 struct inet_sock *inet = inet_sk(sk);
911
912 if (!dst) {
913 dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
914 if (!dst)
915 goto out;
916 }
917 dst->ops->update_pmtu(dst, sk, NULL, mtu);
918
919 dst = __sk_dst_check(sk, 0);
920 if (!dst)
921 dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
922out:
923 return dst;
924}
925EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
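
inet_get_local_port_range() in the hunks above copies the {low, high} port pair inside a do-loop started by read_seqbegin() on sysctl_local_ports.lock, so a concurrent sysctl write can never hand the caller a torn range. A minimal sketch of that read-retry idiom, assuming a simplified seqcount built on C11 atomics purely for illustration rather than the kernel's seqlock API (a real seqlock also needs memory barriers around the data accesses, which this sketch glosses over):

    #include <stdatomic.h>

    struct port_range {
        _Atomic unsigned int seq;   /* even = stable, odd = write in progress */
        int low;
        int high;
    };

    /* Writer: bump to odd, update the pair, bump back to even. */
    static void port_range_set(struct port_range *pr, int low, int high)
    {
        atomic_fetch_add_explicit(&pr->seq, 1, memory_order_release);
        pr->low  = low;
        pr->high = high;
        atomic_fetch_add_explicit(&pr->seq, 1, memory_order_release);
    }

    /* Reader: retry until an even, unchanged sequence brackets the copy,
     * mirroring the read_seqbegin() loop in the hunk above. */
    static void port_range_get(struct port_range *pr, int *low, int *high)
    {
        unsigned int seq;

        do {
            seq = atomic_load_explicit(&pr->seq, memory_order_acquire);
            *low  = pr->low;
            *high = pr->high;
        } while ((seq & 1) ||
                 seq != atomic_load_explicit(&pr->seq, memory_order_acquire));
    }

Readers never block the writer; they simply redo the cheap two-int copy whenever the sequence number was odd or changed mid-copy, which is why this pattern suits a value that is read on every bind() but written only from sysctl.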
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7afa2c3c788..389a2e6a17f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -33,7 +33,6 @@
33#include <linux/stddef.h> 33#include <linux/stddef.h>
34 34
35#include <linux/inet_diag.h> 35#include <linux/inet_diag.h>
36#include <linux/sock_diag.h>
37 36
38static const struct inet_diag_handler **inet_diag_table; 37static const struct inet_diag_handler **inet_diag_table;
39 38
@@ -44,25 +43,26 @@ struct inet_diag_entry {
44 u16 dport; 43 u16 dport;
45 u16 family; 44 u16 family;
46 u16 userlocks; 45 u16 userlocks;
47#if IS_ENABLED(CONFIG_IPV6)
48 struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
49 struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
50#endif
51}; 46};
52 47
48static struct sock *idiagnl;
49
50#define INET_DIAG_PUT(skb, attrtype, attrlen) \
51 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
52
53static DEFINE_MUTEX(inet_diag_table_mutex); 53static DEFINE_MUTEX(inet_diag_table_mutex);
54 54
55static const struct inet_diag_handler *inet_diag_lock_handler(int proto) 55static const struct inet_diag_handler *inet_diag_lock_handler(int type)
56{ 56{
57 if (!inet_diag_table[proto]) 57 if (!inet_diag_table[type])
58 request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 58 request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
59 NETLINK_SOCK_DIAG, AF_INET, proto); 59 NETLINK_INET_DIAG, type);
60 60
61 mutex_lock(&inet_diag_table_mutex); 61 mutex_lock(&inet_diag_table_mutex);
62 if (!inet_diag_table[proto]) 62 if (!inet_diag_table[type])
63 return ERR_PTR(-ENOENT); 63 return ERR_PTR(-ENOENT);
64 64
65 return inet_diag_table[proto]; 65 return inet_diag_table[type];
66} 66}
67 67
68static inline void inet_diag_unlock_handler( 68static inline void inet_diag_unlock_handler(
@@ -71,91 +71,68 @@ static inline void inet_diag_unlock_handler(
71 mutex_unlock(&inet_diag_table_mutex); 71 mutex_unlock(&inet_diag_table_mutex);
72} 72}
73 73
74int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, 74static int inet_csk_diag_fill(struct sock *sk,
75 struct sk_buff *skb, struct inet_diag_req_v2 *req, 75 struct sk_buff *skb,
76 struct user_namespace *user_ns, 76 int ext, u32 pid, u32 seq, u16 nlmsg_flags,
77 u32 portid, u32 seq, u16 nlmsg_flags,
78 const struct nlmsghdr *unlh) 77 const struct nlmsghdr *unlh)
79{ 78{
80 const struct inet_sock *inet = inet_sk(sk); 79 const struct inet_sock *inet = inet_sk(sk);
80 const struct inet_connection_sock *icsk = inet_csk(sk);
81 struct inet_diag_msg *r; 81 struct inet_diag_msg *r;
82 struct nlmsghdr *nlh; 82 struct nlmsghdr *nlh;
83 struct nlattr *attr;
84 void *info = NULL; 83 void *info = NULL;
84 struct inet_diag_meminfo *minfo = NULL;
85 unsigned char *b = skb_tail_pointer(skb);
85 const struct inet_diag_handler *handler; 86 const struct inet_diag_handler *handler;
86 int ext = req->idiag_ext;
87 87
88 handler = inet_diag_table[req->sdiag_protocol]; 88 handler = inet_diag_table[unlh->nlmsg_type];
89 BUG_ON(handler == NULL); 89 BUG_ON(handler == NULL);
90 90
91 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), 91 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
92 nlmsg_flags); 92 nlh->nlmsg_flags = nlmsg_flags;
93 if (!nlh)
94 return -EMSGSIZE;
95 93
96 r = nlmsg_data(nlh); 94 r = NLMSG_DATA(nlh);
97 BUG_ON(sk->sk_state == TCP_TIME_WAIT); 95 BUG_ON(sk->sk_state == TCP_TIME_WAIT);
98 96
97 if (ext & (1 << (INET_DIAG_MEMINFO - 1)))
98 minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, sizeof(*minfo));
99
100 if (ext & (1 << (INET_DIAG_INFO - 1)))
101 info = INET_DIAG_PUT(skb, INET_DIAG_INFO,
102 handler->idiag_info_size);
103
104 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) {
105 const size_t len = strlen(icsk->icsk_ca_ops->name);
106
107 strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1),
108 icsk->icsk_ca_ops->name);
109 }
110
99 r->idiag_family = sk->sk_family; 111 r->idiag_family = sk->sk_family;
100 r->idiag_state = sk->sk_state; 112 r->idiag_state = sk->sk_state;
101 r->idiag_timer = 0; 113 r->idiag_timer = 0;
102 r->idiag_retrans = 0; 114 r->idiag_retrans = 0;
103 115
104 r->id.idiag_if = sk->sk_bound_dev_if; 116 r->id.idiag_if = sk->sk_bound_dev_if;
105 sock_diag_save_cookie(sk, r->id.idiag_cookie); 117 r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
118 r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
106 119
107 r->id.idiag_sport = inet->inet_sport; 120 r->id.idiag_sport = inet->inet_sport;
108 r->id.idiag_dport = inet->inet_dport; 121 r->id.idiag_dport = inet->inet_dport;
109 r->id.idiag_src[0] = inet->inet_rcv_saddr; 122 r->id.idiag_src[0] = inet->inet_rcv_saddr;
110 r->id.idiag_dst[0] = inet->inet_daddr; 123 r->id.idiag_dst[0] = inet->inet_daddr;
111 124
112 if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) 125#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
113 goto errout;
114
115 /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
116 * hence this needs to be included regardless of socket family.
117 */
118 if (ext & (1 << (INET_DIAG_TOS - 1)))
119 if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
120 goto errout;
121
122#if IS_ENABLED(CONFIG_IPV6)
123 if (r->idiag_family == AF_INET6) { 126 if (r->idiag_family == AF_INET6) {
124 const struct ipv6_pinfo *np = inet6_sk(sk); 127 const struct ipv6_pinfo *np = inet6_sk(sk);
125 128
126 *(struct in6_addr *)r->id.idiag_src = np->rcv_saddr; 129 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
127 *(struct in6_addr *)r->id.idiag_dst = np->daddr; 130 &np->rcv_saddr);
128 131 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
129 if (ext & (1 << (INET_DIAG_TCLASS - 1))) 132 &np->daddr);
130 if (nla_put_u8(skb, INET_DIAG_TCLASS, np->tclass) < 0)
131 goto errout;
132 } 133 }
133#endif 134#endif
134 135
135 r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
136 r->idiag_inode = sock_i_ino(sk);
137
138 if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
139 struct inet_diag_meminfo minfo = {
140 .idiag_rmem = sk_rmem_alloc_get(sk),
141 .idiag_wmem = sk->sk_wmem_queued,
142 .idiag_fmem = sk->sk_forward_alloc,
143 .idiag_tmem = sk_wmem_alloc_get(sk),
144 };
145
146 if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
147 goto errout;
148 }
149
150 if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
151 if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
152 goto errout;
153
154 if (icsk == NULL) {
155 handler->idiag_get_info(sk, r, NULL);
156 goto out;
157 }
158
159#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) 136#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
160 137
161 if (icsk->icsk_pending == ICSK_TIME_RETRANS) { 138 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
@@ -176,62 +153,47 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
176 } 153 }
177#undef EXPIRES_IN_MS 154#undef EXPIRES_IN_MS
178 155
179 if (ext & (1 << (INET_DIAG_INFO - 1))) { 156 r->idiag_uid = sock_i_uid(sk);
180 attr = nla_reserve(skb, INET_DIAG_INFO, 157 r->idiag_inode = sock_i_ino(sk);
181 sizeof(struct tcp_info));
182 if (!attr)
183 goto errout;
184 158
185 info = nla_data(attr); 159 if (minfo) {
160 minfo->idiag_rmem = sk_rmem_alloc_get(sk);
161 minfo->idiag_wmem = sk->sk_wmem_queued;
162 minfo->idiag_fmem = sk->sk_forward_alloc;
163 minfo->idiag_tmem = sk_wmem_alloc_get(sk);
186 } 164 }
187 165
188 if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
189 if (nla_put_string(skb, INET_DIAG_CONG,
190 icsk->icsk_ca_ops->name) < 0)
191 goto errout;
192
193 handler->idiag_get_info(sk, r, info); 166 handler->idiag_get_info(sk, r, info);
194 167
195 if (sk->sk_state < TCP_TIME_WAIT && 168 if (sk->sk_state < TCP_TIME_WAIT &&
196 icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) 169 icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
197 icsk->icsk_ca_ops->get_info(sk, ext, skb); 170 icsk->icsk_ca_ops->get_info(sk, ext, skb);
198 171
199out: 172 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
200 return nlmsg_end(skb, nlh); 173 return skb->len;
201 174
202errout: 175rtattr_failure:
203 nlmsg_cancel(skb, nlh); 176nlmsg_failure:
177 nlmsg_trim(skb, b);
204 return -EMSGSIZE; 178 return -EMSGSIZE;
205} 179}
206EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
207
208static int inet_csk_diag_fill(struct sock *sk,
209 struct sk_buff *skb, struct inet_diag_req_v2 *req,
210 struct user_namespace *user_ns,
211 u32 portid, u32 seq, u16 nlmsg_flags,
212 const struct nlmsghdr *unlh)
213{
214 return inet_sk_diag_fill(sk, inet_csk(sk),
215 skb, req, user_ns, portid, seq, nlmsg_flags, unlh);
216}
217 180
218static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, 181static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
219 struct sk_buff *skb, struct inet_diag_req_v2 *req, 182 struct sk_buff *skb, int ext, u32 pid,
220 u32 portid, u32 seq, u16 nlmsg_flags, 183 u32 seq, u16 nlmsg_flags,
221 const struct nlmsghdr *unlh) 184 const struct nlmsghdr *unlh)
222{ 185{
223 long tmo; 186 long tmo;
224 struct inet_diag_msg *r; 187 struct inet_diag_msg *r;
225 struct nlmsghdr *nlh; 188 const unsigned char *previous_tail = skb_tail_pointer(skb);
226 189 struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq,
227 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), 190 unlh->nlmsg_type, sizeof(*r));
228 nlmsg_flags);
229 if (!nlh)
230 return -EMSGSIZE;
231 191
232 r = nlmsg_data(nlh); 192 r = NLMSG_DATA(nlh);
233 BUG_ON(tw->tw_state != TCP_TIME_WAIT); 193 BUG_ON(tw->tw_state != TCP_TIME_WAIT);
234 194
195 nlh->nlmsg_flags = nlmsg_flags;
196
235 tmo = tw->tw_ttd - jiffies; 197 tmo = tw->tw_ttd - jiffies;
236 if (tmo < 0) 198 if (tmo < 0)
237 tmo = 0; 199 tmo = 0;
@@ -239,7 +201,8 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
239 r->idiag_family = tw->tw_family; 201 r->idiag_family = tw->tw_family;
240 r->idiag_retrans = 0; 202 r->idiag_retrans = 0;
241 r->id.idiag_if = tw->tw_bound_dev_if; 203 r->id.idiag_if = tw->tw_bound_dev_if;
242 sock_diag_save_cookie(tw, r->id.idiag_cookie); 204 r->id.idiag_cookie[0] = (u32)(unsigned long)tw;
205 r->id.idiag_cookie[1] = (u32)(((unsigned long)tw >> 31) >> 1);
243 r->id.idiag_sport = tw->tw_sport; 206 r->id.idiag_sport = tw->tw_sport;
244 r->id.idiag_dport = tw->tw_dport; 207 r->id.idiag_dport = tw->tw_dport;
245 r->id.idiag_src[0] = tw->tw_rcv_saddr; 208 r->id.idiag_src[0] = tw->tw_rcv_saddr;
@@ -251,49 +214,62 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
251 r->idiag_wqueue = 0; 214 r->idiag_wqueue = 0;
252 r->idiag_uid = 0; 215 r->idiag_uid = 0;
253 r->idiag_inode = 0; 216 r->idiag_inode = 0;
254#if IS_ENABLED(CONFIG_IPV6) 217#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
255 if (tw->tw_family == AF_INET6) { 218 if (tw->tw_family == AF_INET6) {
256 const struct inet6_timewait_sock *tw6 = 219 const struct inet6_timewait_sock *tw6 =
257 inet6_twsk((struct sock *)tw); 220 inet6_twsk((struct sock *)tw);
258 221
259 *(struct in6_addr *)r->id.idiag_src = tw6->tw_v6_rcv_saddr; 222 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
260 *(struct in6_addr *)r->id.idiag_dst = tw6->tw_v6_daddr; 223 &tw6->tw_v6_rcv_saddr);
224 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
225 &tw6->tw_v6_daddr);
261 } 226 }
262#endif 227#endif
263 228 nlh->nlmsg_len = skb_tail_pointer(skb) - previous_tail;
264 return nlmsg_end(skb, nlh); 229 return skb->len;
230nlmsg_failure:
231 nlmsg_trim(skb, previous_tail);
232 return -EMSGSIZE;
265} 233}
266 234
267static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, 235static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
268 struct inet_diag_req_v2 *r, 236 int ext, u32 pid, u32 seq, u16 nlmsg_flags,
269 struct user_namespace *user_ns,
270 u32 portid, u32 seq, u16 nlmsg_flags,
271 const struct nlmsghdr *unlh) 237 const struct nlmsghdr *unlh)
272{ 238{
273 if (sk->sk_state == TCP_TIME_WAIT) 239 if (sk->sk_state == TCP_TIME_WAIT)
274 return inet_twsk_diag_fill((struct inet_timewait_sock *)sk, 240 return inet_twsk_diag_fill((struct inet_timewait_sock *)sk,
275 skb, r, portid, seq, nlmsg_flags, 241 skb, ext, pid, seq, nlmsg_flags,
276 unlh); 242 unlh);
277 return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh); 243 return inet_csk_diag_fill(sk, skb, ext, pid, seq, nlmsg_flags, unlh);
278} 244}
279 245
280int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, 246static int inet_diag_get_exact(struct sk_buff *in_skb,
281 const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) 247 const struct nlmsghdr *nlh)
282{ 248{
283 int err; 249 int err;
284 struct sock *sk; 250 struct sock *sk;
251 struct inet_diag_req *req = NLMSG_DATA(nlh);
285 struct sk_buff *rep; 252 struct sk_buff *rep;
286 struct net *net = sock_net(in_skb->sk); 253 struct inet_hashinfo *hashinfo;
254 const struct inet_diag_handler *handler;
287 255
256 handler = inet_diag_lock_handler(nlh->nlmsg_type);
257 if (IS_ERR(handler)) {
258 err = PTR_ERR(handler);
259 goto unlock;
260 }
261
262 hashinfo = handler->idiag_hashinfo;
288 err = -EINVAL; 263 err = -EINVAL;
289 if (req->sdiag_family == AF_INET) { 264
290 sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], 265 if (req->idiag_family == AF_INET) {
266 sk = inet_lookup(&init_net, hashinfo, req->id.idiag_dst[0],
291 req->id.idiag_dport, req->id.idiag_src[0], 267 req->id.idiag_dport, req->id.idiag_src[0],
292 req->id.idiag_sport, req->id.idiag_if); 268 req->id.idiag_sport, req->id.idiag_if);
293 } 269 }
294#if IS_ENABLED(CONFIG_IPV6) 270#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
295 else if (req->sdiag_family == AF_INET6) { 271 else if (req->idiag_family == AF_INET6) {
296 sk = inet6_lookup(net, hashinfo, 272 sk = inet6_lookup(&init_net, hashinfo,
297 (struct in6_addr *)req->id.idiag_dst, 273 (struct in6_addr *)req->id.idiag_dst,
298 req->id.idiag_dport, 274 req->id.idiag_dport,
299 (struct in6_addr *)req->id.idiag_src, 275 (struct in6_addr *)req->id.idiag_src,
@@ -302,35 +278,37 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
302 } 278 }
303#endif 279#endif
304 else { 280 else {
305 goto out_nosk; 281 goto unlock;
306 } 282 }
307 283
308 err = -ENOENT; 284 err = -ENOENT;
309 if (sk == NULL) 285 if (sk == NULL)
310 goto out_nosk; 286 goto unlock;
311 287
312 err = sock_diag_check_cookie(sk, req->id.idiag_cookie); 288 err = -ESTALE;
313 if (err) 289 if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE ||
290 req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) &&
291 ((u32)(unsigned long)sk != req->id.idiag_cookie[0] ||
292 (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1]))
314 goto out; 293 goto out;
315 294
316 rep = nlmsg_new(sizeof(struct inet_diag_msg) + 295 err = -ENOMEM;
317 sizeof(struct inet_diag_meminfo) + 296 rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
318 sizeof(struct tcp_info) + 64, GFP_KERNEL); 297 sizeof(struct inet_diag_meminfo) +
319 if (!rep) { 298 handler->idiag_info_size + 64)),
320 err = -ENOMEM; 299 GFP_KERNEL);
300 if (!rep)
321 goto out; 301 goto out;
322 }
323 302
324 err = sk_diag_fill(sk, rep, req, 303 err = sk_diag_fill(sk, rep, req->idiag_ext,
325 sk_user_ns(NETLINK_CB(in_skb).ssk), 304 NETLINK_CB(in_skb).pid,
326 NETLINK_CB(in_skb).portid,
327 nlh->nlmsg_seq, 0, nlh); 305 nlh->nlmsg_seq, 0, nlh);
328 if (err < 0) { 306 if (err < 0) {
329 WARN_ON(err == -EMSGSIZE); 307 WARN_ON(err == -EMSGSIZE);
330 nlmsg_free(rep); 308 kfree_skb(rep);
331 goto out; 309 goto out;
332 } 310 }
333 err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, 311 err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid,
334 MSG_DONTWAIT); 312 MSG_DONTWAIT);
335 if (err > 0) 313 if (err > 0)
336 err = 0; 314 err = 0;
@@ -342,25 +320,8 @@ out:
342 else 320 else
343 sock_put(sk); 321 sock_put(sk);
344 } 322 }
345out_nosk: 323unlock:
346 return err;
347}
348EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
349
350static int inet_diag_get_exact(struct sk_buff *in_skb,
351 const struct nlmsghdr *nlh,
352 struct inet_diag_req_v2 *req)
353{
354 const struct inet_diag_handler *handler;
355 int err;
356
357 handler = inet_diag_lock_handler(req->sdiag_protocol);
358 if (IS_ERR(handler))
359 err = PTR_ERR(handler);
360 else
361 err = handler->dump_one(in_skb, nlh, req);
362 inet_diag_unlock_handler(handler); 324 inet_diag_unlock_handler(handler);
363
364 return err; 325 return err;
365} 326}
366 327
@@ -391,12 +352,9 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
391} 352}
392 353
393 354
394static int inet_diag_bc_run(const struct nlattr *_bc, 355static int inet_diag_bc_run(const void *bc, int len,
395 const struct inet_diag_entry *entry) 356 const struct inet_diag_entry *entry)
396{ 357{
397 const void *bc = nla_data(_bc);
398 int len = nla_len(_bc);
399
400 while (len > 0) { 358 while (len > 0) {
401 int yes = 1; 359 int yes = 1;
402 const struct inet_diag_bc_op *op = bc; 360 const struct inet_diag_bc_op *op = bc;
@@ -435,31 +393,25 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
435 break; 393 break;
436 } 394 }
437 395
396 if (cond->prefix_len == 0)
397 break;
398
438 if (op->code == INET_DIAG_BC_S_COND) 399 if (op->code == INET_DIAG_BC_S_COND)
439 addr = entry->saddr; 400 addr = entry->saddr;
440 else 401 else
441 addr = entry->daddr; 402 addr = entry->daddr;
442 403
443 if (cond->family != AF_UNSPEC &&
444 cond->family != entry->family) {
445 if (entry->family == AF_INET6 &&
446 cond->family == AF_INET) {
447 if (addr[0] == 0 && addr[1] == 0 &&
448 addr[2] == htonl(0xffff) &&
449 bitstring_match(addr + 3,
450 cond->addr,
451 cond->prefix_len))
452 break;
453 }
454 yes = 0;
455 break;
456 }
457
458 if (cond->prefix_len == 0)
459 break;
460 if (bitstring_match(addr, cond->addr, 404 if (bitstring_match(addr, cond->addr,
461 cond->prefix_len)) 405 cond->prefix_len))
462 break; 406 break;
407 if (entry->family == AF_INET6 &&
408 cond->family == AF_INET) {
409 if (addr[0] == 0 && addr[1] == 0 &&
410 addr[2] == htonl(0xffff) &&
411 bitstring_match(addr + 3, cond->addr,
412 cond->prefix_len))
413 break;
414 }
463 yes = 0; 415 yes = 0;
464 break; 416 break;
465 } 417 }
@@ -476,35 +428,6 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
476 return len == 0; 428 return len == 0;
477} 429}
478 430
479int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
480{
481 struct inet_diag_entry entry;
482 struct inet_sock *inet = inet_sk(sk);
483
484 if (bc == NULL)
485 return 1;
486
487 entry.family = sk->sk_family;
488#if IS_ENABLED(CONFIG_IPV6)
489 if (entry.family == AF_INET6) {
490 struct ipv6_pinfo *np = inet6_sk(sk);
491
492 entry.saddr = np->rcv_saddr.s6_addr32;
493 entry.daddr = np->daddr.s6_addr32;
494 } else
495#endif
496 {
497 entry.saddr = &inet->inet_rcv_saddr;
498 entry.daddr = &inet->inet_daddr;
499 }
500 entry.sport = inet->inet_num;
501 entry.dport = ntohs(inet->inet_dport);
502 entry.userlocks = sk->sk_userlocks;
503
504 return inet_diag_bc_run(bc, &entry);
505}
506EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
507
508static int valid_cc(const void *bc, int len, int cc) 431static int valid_cc(const void *bc, int len, int cc)
509{ 432{
510 while (len >= 0) { 433 while (len >= 0) {
@@ -522,55 +445,6 @@ static int valid_cc(const void *bc, int len, int cc)
522 return 0; 445 return 0;
523} 446}
524 447
525/* Validate an inet_diag_hostcond. */
526static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
527 int *min_len)
528{
529 int addr_len;
530 struct inet_diag_hostcond *cond;
531
532 /* Check hostcond space. */
533 *min_len += sizeof(struct inet_diag_hostcond);
534 if (len < *min_len)
535 return false;
536 cond = (struct inet_diag_hostcond *)(op + 1);
537
538 /* Check address family and address length. */
539 switch (cond->family) {
540 case AF_UNSPEC:
541 addr_len = 0;
542 break;
543 case AF_INET:
544 addr_len = sizeof(struct in_addr);
545 break;
546 case AF_INET6:
547 addr_len = sizeof(struct in6_addr);
548 break;
549 default:
550 return false;
551 }
552 *min_len += addr_len;
553 if (len < *min_len)
554 return false;
555
556 /* Check prefix length (in bits) vs address length (in bytes). */
557 if (cond->prefix_len > 8 * addr_len)
558 return false;
559
560 return true;
561}
562
563/* Validate a port comparison operator. */
564static inline bool valid_port_comparison(const struct inet_diag_bc_op *op,
565 int len, int *min_len)
566{
567 /* Port comparisons put the port in a follow-on inet_diag_bc_op. */
568 *min_len += sizeof(struct inet_diag_bc_op);
569 if (len < *min_len)
570 return false;
571 return true;
572}
573
574static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) 448static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
575{ 449{
576 const void *bc = bytecode; 450 const void *bc = bytecode;
@@ -578,39 +452,29 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
578 452
579 while (len > 0) { 453 while (len > 0) {
580 const struct inet_diag_bc_op *op = bc; 454 const struct inet_diag_bc_op *op = bc;
581 int min_len = sizeof(struct inet_diag_bc_op);
582 455
583//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); 456//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
584 switch (op->code) { 457 switch (op->code) {
458 case INET_DIAG_BC_AUTO:
585 case INET_DIAG_BC_S_COND: 459 case INET_DIAG_BC_S_COND:
586 case INET_DIAG_BC_D_COND: 460 case INET_DIAG_BC_D_COND:
587 if (!valid_hostcond(bc, len, &min_len))
588 return -EINVAL;
589 break;
590 case INET_DIAG_BC_S_GE: 461 case INET_DIAG_BC_S_GE:
591 case INET_DIAG_BC_S_LE: 462 case INET_DIAG_BC_S_LE:
592 case INET_DIAG_BC_D_GE: 463 case INET_DIAG_BC_D_GE:
593 case INET_DIAG_BC_D_LE: 464 case INET_DIAG_BC_D_LE:
594 if (!valid_port_comparison(bc, len, &min_len)) 465 case INET_DIAG_BC_JMP:
466 if (op->no < 4 || op->no > len + 4 || op->no & 3)
467 return -EINVAL;
468 if (op->no < len &&
469 !valid_cc(bytecode, bytecode_len, len - op->no))
595 return -EINVAL; 470 return -EINVAL;
596 break; 471 break;
597 case INET_DIAG_BC_AUTO:
598 case INET_DIAG_BC_JMP:
599 case INET_DIAG_BC_NOP: 472 case INET_DIAG_BC_NOP:
600 break; 473 break;
601 default: 474 default:
602 return -EINVAL; 475 return -EINVAL;
603 } 476 }
604 477 if (op->yes < 4 || op->yes > len + 4 || op->yes & 3)
605 if (op->code != INET_DIAG_BC_NOP) {
606 if (op->no < min_len || op->no > len + 4 || op->no & 3)
607 return -EINVAL;
608 if (op->no < len &&
609 !valid_cc(bytecode, bytecode_len, len - op->no))
610 return -EINVAL;
611 }
612
613 if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
614 return -EINVAL; 478 return -EINVAL;
615 bc += op->yes; 479 bc += op->yes;
616 len -= op->yes; 480 len -= op->yes;
@@ -620,30 +484,57 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
620 484
621static int inet_csk_diag_dump(struct sock *sk, 485static int inet_csk_diag_dump(struct sock *sk,
622 struct sk_buff *skb, 486 struct sk_buff *skb,
623 struct netlink_callback *cb, 487 struct netlink_callback *cb)
624 struct inet_diag_req_v2 *r,
625 const struct nlattr *bc)
626{ 488{
627 if (!inet_diag_bc_sk(bc, sk)) 489 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
628 return 0;
629 490
630 return inet_csk_diag_fill(sk, skb, r, 491 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
631 sk_user_ns(NETLINK_CB(cb->skb).ssk), 492 struct inet_diag_entry entry;
632 NETLINK_CB(cb->skb).portid, 493 const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
494 sizeof(*r),
495 INET_DIAG_REQ_BYTECODE);
496 struct inet_sock *inet = inet_sk(sk);
497
498 entry.family = sk->sk_family;
499#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
500 if (entry.family == AF_INET6) {
501 struct ipv6_pinfo *np = inet6_sk(sk);
502
503 entry.saddr = np->rcv_saddr.s6_addr32;
504 entry.daddr = np->daddr.s6_addr32;
505 } else
506#endif
507 {
508 entry.saddr = &inet->inet_rcv_saddr;
509 entry.daddr = &inet->inet_daddr;
510 }
511 entry.sport = inet->inet_num;
512 entry.dport = ntohs(inet->inet_dport);
513 entry.userlocks = sk->sk_userlocks;
514
515 if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
516 return 0;
517 }
518
519 return inet_csk_diag_fill(sk, skb, r->idiag_ext,
520 NETLINK_CB(cb->skb).pid,
633 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 521 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
634} 522}
635 523
636static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, 524static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
637 struct sk_buff *skb, 525 struct sk_buff *skb,
638 struct netlink_callback *cb, 526 struct netlink_callback *cb)
639 struct inet_diag_req_v2 *r,
640 const struct nlattr *bc)
641{ 527{
642 if (bc != NULL) { 528 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
529
530 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
643 struct inet_diag_entry entry; 531 struct inet_diag_entry entry;
532 const struct nlattr *bc = nlmsg_find_attr(cb->nlh,
533 sizeof(*r),
534 INET_DIAG_REQ_BYTECODE);
644 535
645 entry.family = tw->tw_family; 536 entry.family = tw->tw_family;
646#if IS_ENABLED(CONFIG_IPV6) 537#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
647 if (tw->tw_family == AF_INET6) { 538 if (tw->tw_family == AF_INET6) {
648 struct inet6_timewait_sock *tw6 = 539 struct inet6_timewait_sock *tw6 =
649 inet6_twsk((struct sock *)tw); 540 inet6_twsk((struct sock *)tw);
@@ -659,70 +550,38 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw,
659 entry.dport = ntohs(tw->tw_dport); 550 entry.dport = ntohs(tw->tw_dport);
660 entry.userlocks = 0; 551 entry.userlocks = 0;
661 552
662 if (!inet_diag_bc_run(bc, &entry)) 553 if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry))
663 return 0; 554 return 0;
664 } 555 }
665 556
666 return inet_twsk_diag_fill(tw, skb, r, 557 return inet_twsk_diag_fill(tw, skb, r->idiag_ext,
667 NETLINK_CB(cb->skb).portid, 558 NETLINK_CB(cb->skb).pid,
668 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); 559 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
669} 560}
670 561
671/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses
672 * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6.
673 */
674static inline void inet_diag_req_addrs(const struct sock *sk,
675 const struct request_sock *req,
676 struct inet_diag_entry *entry)
677{
678 struct inet_request_sock *ireq = inet_rsk(req);
679
680#if IS_ENABLED(CONFIG_IPV6)
681 if (sk->sk_family == AF_INET6) {
682 if (req->rsk_ops->family == AF_INET6) {
683 entry->saddr = inet6_rsk(req)->loc_addr.s6_addr32;
684 entry->daddr = inet6_rsk(req)->rmt_addr.s6_addr32;
685 } else if (req->rsk_ops->family == AF_INET) {
686 ipv6_addr_set_v4mapped(ireq->loc_addr,
687 &entry->saddr_storage);
688 ipv6_addr_set_v4mapped(ireq->rmt_addr,
689 &entry->daddr_storage);
690 entry->saddr = entry->saddr_storage.s6_addr32;
691 entry->daddr = entry->daddr_storage.s6_addr32;
692 }
693 } else
694#endif
695 {
696 entry->saddr = &ireq->loc_addr;
697 entry->daddr = &ireq->rmt_addr;
698 }
699}
700
701static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, 562static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
702 struct request_sock *req, 563 struct request_sock *req, u32 pid, u32 seq,
703 struct user_namespace *user_ns,
704 u32 portid, u32 seq,
705 const struct nlmsghdr *unlh) 564 const struct nlmsghdr *unlh)
706{ 565{
707 const struct inet_request_sock *ireq = inet_rsk(req); 566 const struct inet_request_sock *ireq = inet_rsk(req);
708 struct inet_sock *inet = inet_sk(sk); 567 struct inet_sock *inet = inet_sk(sk);
568 unsigned char *b = skb_tail_pointer(skb);
709 struct inet_diag_msg *r; 569 struct inet_diag_msg *r;
710 struct nlmsghdr *nlh; 570 struct nlmsghdr *nlh;
711 long tmo; 571 long tmo;
712 572
713 nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), 573 nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r));
714 NLM_F_MULTI); 574 nlh->nlmsg_flags = NLM_F_MULTI;
715 if (!nlh) 575 r = NLMSG_DATA(nlh);
716 return -EMSGSIZE;
717 576
718 r = nlmsg_data(nlh);
719 r->idiag_family = sk->sk_family; 577 r->idiag_family = sk->sk_family;
720 r->idiag_state = TCP_SYN_RECV; 578 r->idiag_state = TCP_SYN_RECV;
721 r->idiag_timer = 1; 579 r->idiag_timer = 1;
722 r->idiag_retrans = req->num_retrans; 580 r->idiag_retrans = req->retrans;
723 581
724 r->id.idiag_if = sk->sk_bound_dev_if; 582 r->id.idiag_if = sk->sk_bound_dev_if;
725 sock_diag_save_cookie(req, r->id.idiag_cookie); 583 r->id.idiag_cookie[0] = (u32)(unsigned long)req;
584 r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
726 585
727 tmo = req->expires - jiffies; 586 tmo = req->expires - jiffies;
728 if (tmo < 0) 587 if (tmo < 0)
@@ -735,28 +594,33 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
735 r->idiag_expires = jiffies_to_msecs(tmo); 594 r->idiag_expires = jiffies_to_msecs(tmo);
736 r->idiag_rqueue = 0; 595 r->idiag_rqueue = 0;
737 r->idiag_wqueue = 0; 596 r->idiag_wqueue = 0;
738 r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); 597 r->idiag_uid = sock_i_uid(sk);
739 r->idiag_inode = 0; 598 r->idiag_inode = 0;
740#if IS_ENABLED(CONFIG_IPV6) 599#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
741 if (r->idiag_family == AF_INET6) { 600 if (r->idiag_family == AF_INET6) {
742 struct inet_diag_entry entry; 601 ipv6_addr_copy((struct in6_addr *)r->id.idiag_src,
743 inet_diag_req_addrs(sk, req, &entry); 602 &inet6_rsk(req)->loc_addr);
744 memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr)); 603 ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst,
745 memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr)); 604 &inet6_rsk(req)->rmt_addr);
746 } 605 }
747#endif 606#endif
607 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
608
609 return skb->len;
748 610
749 return nlmsg_end(skb, nlh); 611nlmsg_failure:
612 nlmsg_trim(skb, b);
613 return -1;
750} 614}
751 615
752static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, 616static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
753 struct netlink_callback *cb, 617 struct netlink_callback *cb)
754 struct inet_diag_req_v2 *r,
755 const struct nlattr *bc)
756{ 618{
757 struct inet_diag_entry entry; 619 struct inet_diag_entry entry;
620 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
758 struct inet_connection_sock *icsk = inet_csk(sk); 621 struct inet_connection_sock *icsk = inet_csk(sk);
759 struct listen_sock *lopt; 622 struct listen_sock *lopt;
623 const struct nlattr *bc = NULL;
760 struct inet_sock *inet = inet_sk(sk); 624 struct inet_sock *inet = inet_sk(sk);
761 int j, s_j; 625 int j, s_j;
762 int reqnum, s_reqnum; 626 int reqnum, s_reqnum;
@@ -776,7 +640,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
776 if (!lopt || !lopt->qlen) 640 if (!lopt || !lopt->qlen)
777 goto out; 641 goto out;
778 642
779 if (bc != NULL) { 643 if (nlmsg_attrlen(cb->nlh, sizeof(*r))) {
644 bc = nlmsg_find_attr(cb->nlh, sizeof(*r),
645 INET_DIAG_REQ_BYTECODE);
780 entry.sport = inet->inet_num; 646 entry.sport = inet->inet_num;
781 entry.userlocks = sk->sk_userlocks; 647 entry.userlocks = sk->sk_userlocks;
782 } 648 }
@@ -795,16 +661,27 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
795 continue; 661 continue;
796 662
797 if (bc) { 663 if (bc) {
798 inet_diag_req_addrs(sk, req, &entry); 664 entry.saddr =
665#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
666 (entry.family == AF_INET6) ?
667 inet6_rsk(req)->loc_addr.s6_addr32 :
668#endif
669 &ireq->loc_addr;
670 entry.daddr =
671#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
672 (entry.family == AF_INET6) ?
673 inet6_rsk(req)->rmt_addr.s6_addr32 :
674#endif
675 &ireq->rmt_addr;
799 entry.dport = ntohs(ireq->rmt_port); 676 entry.dport = ntohs(ireq->rmt_port);
800 677
801 if (!inet_diag_bc_run(bc, &entry)) 678 if (!inet_diag_bc_run(nla_data(bc),
679 nla_len(bc), &entry))
802 continue; 680 continue;
803 } 681 }
804 682
805 err = inet_diag_fill_req(skb, sk, req, 683 err = inet_diag_fill_req(skb, sk, req,
806 sk_user_ns(NETLINK_CB(cb->skb).ssk), 684 NETLINK_CB(cb->skb).pid,
807 NETLINK_CB(cb->skb).portid,
808 cb->nlh->nlmsg_seq, cb->nlh); 685 cb->nlh->nlmsg_seq, cb->nlh);
809 if (err < 0) { 686 if (err < 0) {
810 cb->args[3] = j + 1; 687 cb->args[3] = j + 1;
@@ -822,12 +699,19 @@ out:
822 return err; 699 return err;
823} 700}
824 701
825void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, 702static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
826 struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc)
827{ 703{
828 int i, num; 704 int i, num;
829 int s_i, s_num; 705 int s_i, s_num;
830 struct net *net = sock_net(skb->sk); 706 struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
707 const struct inet_diag_handler *handler;
708 struct inet_hashinfo *hashinfo;
709
710 handler = inet_diag_lock_handler(cb->nlh->nlmsg_type);
711 if (IS_ERR(handler))
712 goto unlock;
713
714 hashinfo = handler->idiag_hashinfo;
831 715
832 s_i = cb->args[1]; 716 s_i = cb->args[1];
833 s_num = num = cb->args[2]; 717 s_num = num = cb->args[2];
@@ -847,18 +731,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
847 sk_nulls_for_each(sk, node, &ilb->head) { 731 sk_nulls_for_each(sk, node, &ilb->head) {
848 struct inet_sock *inet = inet_sk(sk); 732 struct inet_sock *inet = inet_sk(sk);
849 733
850 if (!net_eq(sock_net(sk), net))
851 continue;
852
853 if (num < s_num) { 734 if (num < s_num) {
854 num++; 735 num++;
855 continue; 736 continue;
856 } 737 }
857 738
858 if (r->sdiag_family != AF_UNSPEC &&
859 sk->sk_family != r->sdiag_family)
860 goto next_listen;
861
862 if (r->id.idiag_sport != inet->inet_sport && 739 if (r->id.idiag_sport != inet->inet_sport &&
863 r->id.idiag_sport) 740 r->id.idiag_sport)
864 goto next_listen; 741 goto next_listen;
@@ -868,7 +745,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
868 cb->args[3] > 0) 745 cb->args[3] > 0)
869 goto syn_recv; 746 goto syn_recv;
870 747
871 if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { 748 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
872 spin_unlock_bh(&ilb->lock); 749 spin_unlock_bh(&ilb->lock);
873 goto done; 750 goto done;
874 } 751 }
@@ -877,7 +754,7 @@ syn_recv:
877 if (!(r->idiag_states & TCPF_SYN_RECV)) 754 if (!(r->idiag_states & TCPF_SYN_RECV))
878 goto next_listen; 755 goto next_listen;
879 756
880 if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { 757 if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
881 spin_unlock_bh(&ilb->lock); 758 spin_unlock_bh(&ilb->lock);
882 goto done; 759 goto done;
883 } 760 }
@@ -899,7 +776,7 @@ skip_listen_ht:
899 } 776 }
900 777
901 if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) 778 if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
902 goto out; 779 goto unlock;
903 780
904 for (i = s_i; i <= hashinfo->ehash_mask; i++) { 781 for (i = s_i; i <= hashinfo->ehash_mask; i++) {
905 struct inet_ehash_bucket *head = &hashinfo->ehash[i]; 782 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
@@ -920,22 +797,17 @@ skip_listen_ht:
920 sk_nulls_for_each(sk, node, &head->chain) { 797 sk_nulls_for_each(sk, node, &head->chain) {
921 struct inet_sock *inet = inet_sk(sk); 798 struct inet_sock *inet = inet_sk(sk);
922 799
923 if (!net_eq(sock_net(sk), net))
924 continue;
925 if (num < s_num) 800 if (num < s_num)
926 goto next_normal; 801 goto next_normal;
927 if (!(r->idiag_states & (1 << sk->sk_state))) 802 if (!(r->idiag_states & (1 << sk->sk_state)))
928 goto next_normal; 803 goto next_normal;
929 if (r->sdiag_family != AF_UNSPEC &&
930 sk->sk_family != r->sdiag_family)
931 goto next_normal;
932 if (r->id.idiag_sport != inet->inet_sport && 804 if (r->id.idiag_sport != inet->inet_sport &&
933 r->id.idiag_sport) 805 r->id.idiag_sport)
934 goto next_normal; 806 goto next_normal;
935 if (r->id.idiag_dport != inet->inet_dport && 807 if (r->id.idiag_dport != inet->inet_dport &&
936 r->id.idiag_dport) 808 r->id.idiag_dport)
937 goto next_normal; 809 goto next_normal;
938 if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) { 810 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
939 spin_unlock_bh(lock); 811 spin_unlock_bh(lock);
940 goto done; 812 goto done;
941 } 813 }
@@ -948,21 +820,16 @@ next_normal:
948 820
949 inet_twsk_for_each(tw, node, 821 inet_twsk_for_each(tw, node,
950 &head->twchain) { 822 &head->twchain) {
951 if (!net_eq(twsk_net(tw), net))
952 continue;
953 823
954 if (num < s_num) 824 if (num < s_num)
955 goto next_dying; 825 goto next_dying;
956 if (r->sdiag_family != AF_UNSPEC &&
957 tw->tw_family != r->sdiag_family)
958 goto next_dying;
959 if (r->id.idiag_sport != tw->tw_sport && 826 if (r->id.idiag_sport != tw->tw_sport &&
960 r->id.idiag_sport) 827 r->id.idiag_sport)
961 goto next_dying; 828 goto next_dying;
962 if (r->id.idiag_dport != tw->tw_dport && 829 if (r->id.idiag_dport != tw->tw_dport &&
963 r->id.idiag_dport) 830 r->id.idiag_dport)
964 goto next_dying; 831 goto next_dying;
965 if (inet_twsk_diag_dump(tw, skb, cb, r, bc) < 0) { 832 if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
966 spin_unlock_bh(lock); 833 spin_unlock_bh(lock);
967 goto done; 834 goto done;
968 } 835 }
@@ -976,89 +843,15 @@ next_dying:
976done: 843done:
977 cb->args[1] = i; 844 cb->args[1] = i;
978 cb->args[2] = num; 845 cb->args[2] = num;
979out: 846unlock:
980 ;
981}
982EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
983
984static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
985 struct inet_diag_req_v2 *r, struct nlattr *bc)
986{
987 const struct inet_diag_handler *handler;
988 int err = 0;
989
990 handler = inet_diag_lock_handler(r->sdiag_protocol);
991 if (!IS_ERR(handler))
992 handler->dump(skb, cb, r, bc);
993 else
994 err = PTR_ERR(handler);
995 inet_diag_unlock_handler(handler); 847 inet_diag_unlock_handler(handler);
996 848 return skb->len;
997 return err ? : skb->len;
998}
999
1000static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
1001{
1002 struct nlattr *bc = NULL;
1003 int hdrlen = sizeof(struct inet_diag_req_v2);
1004
1005 if (nlmsg_attrlen(cb->nlh, hdrlen))
1006 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
1007
1008 return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
1009} 849}
1010 850
1011static inline int inet_diag_type2proto(int type) 851static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
1012{ 852{
1013 switch (type) {
1014 case TCPDIAG_GETSOCK:
1015 return IPPROTO_TCP;
1016 case DCCPDIAG_GETSOCK:
1017 return IPPROTO_DCCP;
1018 default:
1019 return 0;
1020 }
1021}
1022
1023static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
1024{
1025 struct inet_diag_req *rc = nlmsg_data(cb->nlh);
1026 struct inet_diag_req_v2 req;
1027 struct nlattr *bc = NULL;
1028 int hdrlen = sizeof(struct inet_diag_req); 853 int hdrlen = sizeof(struct inet_diag_req);
1029 854
1030 req.sdiag_family = AF_UNSPEC; /* compatibility */
1031 req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
1032 req.idiag_ext = rc->idiag_ext;
1033 req.idiag_states = rc->idiag_states;
1034 req.id = rc->id;
1035
1036 if (nlmsg_attrlen(cb->nlh, hdrlen))
1037 bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
1038
1039 return __inet_diag_dump(skb, cb, &req, bc);
1040}
1041
1042static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
1043 const struct nlmsghdr *nlh)
1044{
1045 struct inet_diag_req *rc = nlmsg_data(nlh);
1046 struct inet_diag_req_v2 req;
1047
1048 req.sdiag_family = rc->idiag_family;
1049 req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
1050 req.idiag_ext = rc->idiag_ext;
1051 req.idiag_states = rc->idiag_states;
1052 req.id = rc->id;
1053
1054 return inet_diag_get_exact(in_skb, nlh, &req);
1055}
1056
1057static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
1058{
1059 int hdrlen = sizeof(struct inet_diag_req);
1060 struct net *net = sock_net(skb->sk);
1061
1062 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || 855 if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
1063 nlmsg_len(nlh) < hdrlen) 856 nlmsg_len(nlh) < hdrlen)
1064 return -EINVAL; 857 return -EINVAL;
@@ -1074,62 +867,29 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
1074 inet_diag_bc_audit(nla_data(attr), nla_len(attr))) 867 inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
1075 return -EINVAL; 868 return -EINVAL;
1076 } 869 }
1077 {
1078 struct netlink_dump_control c = {
1079 .dump = inet_diag_dump_compat,
1080 };
1081 return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
1082 }
1083 }
1084
1085 return inet_diag_get_exact_compat(skb, nlh);
1086}
1087
1088static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
1089{
1090 int hdrlen = sizeof(struct inet_diag_req_v2);
1091 struct net *net = sock_net(skb->sk);
1092 870
1093 if (nlmsg_len(h) < hdrlen) 871 return netlink_dump_start(idiagnl, skb, nlh,
1094 return -EINVAL; 872 inet_diag_dump, NULL, 0);
1095
1096 if (h->nlmsg_flags & NLM_F_DUMP) {
1097 if (nlmsg_attrlen(h, hdrlen)) {
1098 struct nlattr *attr;
1099 attr = nlmsg_find_attr(h, hdrlen,
1100 INET_DIAG_REQ_BYTECODE);
1101 if (attr == NULL ||
1102 nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
1103 inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
1104 return -EINVAL;
1105 }
1106 {
1107 struct netlink_dump_control c = {
1108 .dump = inet_diag_dump,
1109 };
1110 return netlink_dump_start(net->diag_nlsk, skb, h, &c);
1111 }
1112 } 873 }
1113 874
1114 return inet_diag_get_exact(skb, h, nlmsg_data(h)); 875 return inet_diag_get_exact(skb, nlh);
1115} 876}
1116 877
1117static const struct sock_diag_handler inet_diag_handler = { 878static DEFINE_MUTEX(inet_diag_mutex);
1118 .family = AF_INET,
1119 .dump = inet_diag_handler_dump,
1120};
1121 879
1122static const struct sock_diag_handler inet6_diag_handler = { 880static void inet_diag_rcv(struct sk_buff *skb)
1123 .family = AF_INET6, 881{
1124 .dump = inet_diag_handler_dump, 882 mutex_lock(&inet_diag_mutex);
1125}; 883 netlink_rcv_skb(skb, &inet_diag_rcv_msg);
884 mutex_unlock(&inet_diag_mutex);
885}
1126 886
1127int inet_diag_register(const struct inet_diag_handler *h) 887int inet_diag_register(const struct inet_diag_handler *h)
1128{ 888{
1129 const __u16 type = h->idiag_type; 889 const __u16 type = h->idiag_type;
1130 int err = -EINVAL; 890 int err = -EINVAL;
1131 891
1132 if (type >= IPPROTO_MAX) 892 if (type >= INET_DIAG_GETSOCK_MAX)
1133 goto out; 893 goto out;
1134 894
1135 mutex_lock(&inet_diag_table_mutex); 895 mutex_lock(&inet_diag_table_mutex);
@@ -1148,7 +908,7 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
1148{ 908{
1149 const __u16 type = h->idiag_type; 909 const __u16 type = h->idiag_type;
1150 910
1151 if (type >= IPPROTO_MAX) 911 if (type >= INET_DIAG_GETSOCK_MAX)
1152 return; 912 return;
1153 913
1154 mutex_lock(&inet_diag_table_mutex); 914 mutex_lock(&inet_diag_table_mutex);
@@ -1159,7 +919,7 @@ EXPORT_SYMBOL_GPL(inet_diag_unregister);
1159 919
1160static int __init inet_diag_init(void) 920static int __init inet_diag_init(void)
1161{ 921{
1162 const int inet_diag_table_size = (IPPROTO_MAX * 922 const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX *
1163 sizeof(struct inet_diag_handler *)); 923 sizeof(struct inet_diag_handler *));
1164 int err = -ENOMEM; 924 int err = -ENOMEM;
1165 925
@@ -1167,35 +927,25 @@ static int __init inet_diag_init(void)
1167 if (!inet_diag_table) 927 if (!inet_diag_table)
1168 goto out; 928 goto out;
1169 929
1170 err = sock_diag_register(&inet_diag_handler); 930 idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0,
1171 if (err) 931 inet_diag_rcv, NULL, THIS_MODULE);
1172 goto out_free_nl; 932 if (idiagnl == NULL)
1173 933 goto out_free_table;
1174 err = sock_diag_register(&inet6_diag_handler); 934 err = 0;
1175 if (err)
1176 goto out_free_inet;
1177
1178 sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
1179out: 935out:
1180 return err; 936 return err;
1181 937out_free_table:
1182out_free_inet:
1183 sock_diag_unregister(&inet_diag_handler);
1184out_free_nl:
1185 kfree(inet_diag_table); 938 kfree(inet_diag_table);
1186 goto out; 939 goto out;
1187} 940}
1188 941
1189static void __exit inet_diag_exit(void) 942static void __exit inet_diag_exit(void)
1190{ 943{
1191 sock_diag_unregister(&inet6_diag_handler); 944 netlink_kernel_release(idiagnl);
1192 sock_diag_unregister(&inet_diag_handler);
1193 sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
1194 kfree(inet_diag_table); 945 kfree(inet_diag_table);
1195} 946}
1196 947
1197module_init(inet_diag_init); 948module_init(inet_diag_init);
1198module_exit(inet_diag_exit); 949module_exit(inet_diag_exit);
1199MODULE_LICENSE("GPL"); 950MODULE_LICENSE("GPL");
1200MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */); 951MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_INET_DIAG);
1201MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
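The inet_diag.c hunks above capture the split between the newer sock_diag front end (NETLINK_SOCK_DIAG, struct inet_diag_req_v2, per-address-family handlers) and the older standalone NETLINK_INET_DIAG socket that takes a struct inet_diag_req keyed by TCPDIAG_GETSOCK/DCCPDIAG_GETSOCK. The following user-space C sketch issues a dump request in the legacy format that the right-hand column's inet_diag_rcv_msg()/inet_diag_dump() parse; it is illustrative only (the receive loop is simplified and inspects just the first netlink header per datagram), and on current kernels the same request is served by the compat path.

/* Minimal sketch of a legacy inet_diag TCP dump request.  Assumes the
 * sanitized kernel headers; NETLINK_INET_DIAG is the same protocol
 * number as NETLINK_SOCK_DIAG on newer headers.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/inet_diag.h>

#ifndef NETLINK_INET_DIAG
#define NETLINK_INET_DIAG 4     /* alias of NETLINK_SOCK_DIAG */
#endif

int main(void)
{
        struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
        struct {
                struct nlmsghdr nlh;
                struct inet_diag_req req;       /* the v1 request parsed above */
        } msg;
        char buf[8192];
        int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_INET_DIAG);

        if (fd < 0) {
                perror("socket");
                return 1;
        }

        memset(&msg, 0, sizeof(msg));
        msg.nlh.nlmsg_len = sizeof(msg);
        msg.nlh.nlmsg_type = TCPDIAG_GETSOCK;   /* type2proto -> IPPROTO_TCP */
        msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
        msg.req.idiag_family = AF_INET;
        msg.req.idiag_states = ~0U;             /* dump every TCP state */

        if (sendto(fd, &msg, sizeof(msg), 0,
                   (struct sockaddr *)&kernel, sizeof(kernel)) < 0) {
                perror("sendto");
                return 1;
        }

        for (;;) {
                ssize_t len = recv(fd, buf, sizeof(buf), 0);
                struct nlmsghdr *h = (struct nlmsghdr *)buf;

                /* simplified: only the first header per datagram is checked */
                if (len <= 0 || h->nlmsg_type == NLMSG_DONE ||
                    h->nlmsg_type == NLMSG_ERROR)
                        break;
                printf("got %zd bytes of inet_diag records\n", len);
        }
        close(fd);
        return 0;
}

The reply stream consists of struct inet_diag_msg records that a real tool would walk with NLMSG_NEXT(); the sketch only reports the byte counts.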
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 4750d2b74d7..5ff2a51b6d0 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -89,7 +89,7 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
89 nf->low_thresh = 0; 89 nf->low_thresh = 0;
90 90
91 local_bh_disable(); 91 local_bh_disable();
92 inet_frag_evictor(nf, f, true); 92 inet_frag_evictor(nf, f);
93 local_bh_enable(); 93 local_bh_enable();
94} 94}
95EXPORT_SYMBOL(inet_frags_exit_net); 95EXPORT_SYMBOL(inet_frags_exit_net);
@@ -158,16 +158,11 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
158} 158}
159EXPORT_SYMBOL(inet_frag_destroy); 159EXPORT_SYMBOL(inet_frag_destroy);
160 160
161int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force) 161int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f)
162{ 162{
163 struct inet_frag_queue *q; 163 struct inet_frag_queue *q;
164 int work, evicted = 0; 164 int work, evicted = 0;
165 165
166 if (!force) {
167 if (atomic_read(&nf->mem) <= nf->high_thresh)
168 return 0;
169 }
170
171 work = atomic_read(&nf->mem) - nf->low_thresh; 166 work = atomic_read(&nf->mem) - nf->low_thresh;
172 while (work > 0) { 167 while (work > 0) {
173 read_lock(&f->lock); 168 read_lock(&f->lock);
@@ -248,12 +243,12 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
248 if (q == NULL) 243 if (q == NULL)
249 return NULL; 244 return NULL;
250 245
251 q->net = nf;
252 f->constructor(q, arg); 246 f->constructor(q, arg);
253 atomic_add(f->qsize, &nf->mem); 247 atomic_add(f->qsize, &nf->mem);
254 setup_timer(&q->timer, f->frag_expire, (unsigned long)q); 248 setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
255 spin_lock_init(&q->lock); 249 spin_lock_init(&q->lock);
256 atomic_set(&q->refcnt, 1); 250 atomic_set(&q->refcnt, 1);
251 q->net = nf;
257 252
258 return q; 253 return q;
259} 254}
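The inet_fragment.c hunk removes the force parameter and the high-watermark short-circuit from inet_frag_evictor(), which is why the ip_fragment.c hunk further down reinstates the "only evict above high_thresh" check at the ip_defrag() call site. A minimal stand-alone sketch of that two-watermark pattern, with invented names (frag_mem, evict_one, maybe_evict), looks like this:

/* Toy illustration of the high/low watermark eviction pattern: nothing
 * happens until memory use crosses high_thresh, then entries are evicted
 * until usage drops back toward low_thresh.  All names are local to this
 * sketch; the thresholds match the historic ipfrag defaults.
 */
#include <stdio.h>

static long frag_mem = 300000;          /* pretend current usage, bytes */
static const long high_thresh = 262144;
static const long low_thresh  = 196608;

static long evict_one(void)
{
        frag_mem -= 4096;               /* pretend each queue frees ~4 KiB */
        return 4096;
}

static void maybe_evict(void)
{
        long work;

        if (frag_mem <= high_thresh)    /* the check ip_defrag() now does */
                return;

        work = frag_mem - low_thresh;   /* same arithmetic as inet_frag_evictor() */
        while (work > 0)
                work -= evict_one();
}

int main(void)
{
        maybe_evict();
        printf("usage after eviction: %ld bytes\n", frag_mem);
        return 0;
}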
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fa3ae814871..984ec656b03 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -217,7 +217,7 @@ begin:
217} 217}
218EXPORT_SYMBOL_GPL(__inet_lookup_listener); 218EXPORT_SYMBOL_GPL(__inet_lookup_listener);
219 219
220struct sock *__inet_lookup_established(struct net *net, 220struct sock * __inet_lookup_established(struct net *net,
221 struct inet_hashinfo *hashinfo, 221 struct inet_hashinfo *hashinfo,
222 const __be32 saddr, const __be16 sport, 222 const __be32 saddr, const __be16 sport,
223 const __be32 daddr, const u16 hnum, 223 const __be32 daddr, const u16 hnum,
@@ -237,14 +237,12 @@ struct sock *__inet_lookup_established(struct net *net,
237 rcu_read_lock(); 237 rcu_read_lock();
238begin: 238begin:
239 sk_nulls_for_each_rcu(sk, node, &head->chain) { 239 sk_nulls_for_each_rcu(sk, node, &head->chain) {
240 if (sk->sk_hash != hash) 240 if (INET_MATCH(sk, net, hash, acookie,
241 continue; 241 saddr, daddr, ports, dif)) {
242 if (likely(INET_MATCH(sk, net, acookie,
243 saddr, daddr, ports, dif))) {
244 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) 242 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
245 goto begintw; 243 goto begintw;
246 if (unlikely(!INET_MATCH(sk, net, acookie, 244 if (unlikely(!INET_MATCH(sk, net, hash, acookie,
247 saddr, daddr, ports, dif))) { 245 saddr, daddr, ports, dif))) {
248 sock_put(sk); 246 sock_put(sk);
249 goto begin; 247 goto begin;
250 } 248 }
@@ -262,18 +260,14 @@ begin:
262begintw: 260begintw:
263 /* Must check for a TIME_WAIT'er before going to listener hash. */ 261 /* Must check for a TIME_WAIT'er before going to listener hash. */
264 sk_nulls_for_each_rcu(sk, node, &head->twchain) { 262 sk_nulls_for_each_rcu(sk, node, &head->twchain) {
265 if (sk->sk_hash != hash) 263 if (INET_TW_MATCH(sk, net, hash, acookie,
266 continue; 264 saddr, daddr, ports, dif)) {
267 if (likely(INET_TW_MATCH(sk, net, acookie,
268 saddr, daddr, ports,
269 dif))) {
270 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { 265 if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
271 sk = NULL; 266 sk = NULL;
272 goto out; 267 goto out;
273 } 268 }
274 if (unlikely(!INET_TW_MATCH(sk, net, acookie, 269 if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
275 saddr, daddr, ports, 270 saddr, daddr, ports, dif))) {
276 dif))) {
277 sock_put(sk); 271 sock_put(sk);
278 goto begintw; 272 goto begintw;
279 } 273 }
@@ -320,12 +314,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
320 314
321 /* Check TIME-WAIT sockets first. */ 315 /* Check TIME-WAIT sockets first. */
322 sk_nulls_for_each(sk2, node, &head->twchain) { 316 sk_nulls_for_each(sk2, node, &head->twchain) {
323 if (sk2->sk_hash != hash) 317 tw = inet_twsk(sk2);
324 continue;
325 318
326 if (likely(INET_TW_MATCH(sk2, net, acookie, 319 if (INET_TW_MATCH(sk2, net, hash, acookie,
327 saddr, daddr, ports, dif))) { 320 saddr, daddr, ports, dif)) {
328 tw = inet_twsk(sk2);
329 if (twsk_unique(sk, sk2, twp)) 321 if (twsk_unique(sk, sk2, twp))
330 goto unique; 322 goto unique;
331 else 323 else
@@ -336,10 +328,8 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
336 328
337 /* And established part... */ 329 /* And established part... */
338 sk_nulls_for_each(sk2, node, &head->chain) { 330 sk_nulls_for_each(sk2, node, &head->chain) {
339 if (sk2->sk_hash != hash) 331 if (INET_MATCH(sk2, net, hash, acookie,
340 continue; 332 saddr, daddr, ports, dif))
341 if (likely(INET_MATCH(sk2, net, acookie,
342 saddr, daddr, ports, dif)))
343 goto not_unique; 333 goto not_unique;
344 } 334 }
345 335
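The inet_hashtables.c hunks change how established-hash lookups call INET_MATCH()/INET_TW_MATCH(): the newer code filters on sk->sk_hash first and then matches on an address-pair cookie, while the older macros take the hash as an extra argument. The cookie trick is that, on 64-bit hosts, both IPv4 addresses are folded into one 64-bit word and both ports into one 32-bit word, so a full 4-tuple match costs two compares. The sketch below uses local names (addrpair_t, make_addrpair, flow_match) rather than the kernel macros, and its word ordering is illustrative; the real INET_ADDR_COOKIE/INET_COMBINED_PORTS macros depend on endianness.

/* Sketch of the addrpair/portpair idea behind INET_MATCH(). */
#include <stdio.h>
#include <stdint.h>

typedef uint64_t addrpair_t;
typedef uint32_t portpair_t;

static addrpair_t make_addrpair(uint32_t saddr_be, uint32_t daddr_be)
{
        return ((addrpair_t)saddr_be << 32) | daddr_be;
}

static portpair_t make_portpair(uint16_t sport_be, uint16_t dport_be)
{
        return ((portpair_t)sport_be << 16) | dport_be;
}

struct flow {
        addrpair_t addrs;
        portpair_t ports;
};

static int flow_match(const struct flow *f, addrpair_t acookie, portpair_t ports)
{
        return f->addrs == acookie && f->ports == ports;   /* two compares */
}

int main(void)
{
        struct flow f = {
                .addrs = make_addrpair(0x0a000001, 0x0a000002), /* 10.0.0.1 -> 10.0.0.2 */
                .ports = make_portpair(0x1234, 0x0050),
        };

        printf("match: %d\n",
               flow_match(&f, make_addrpair(0x0a000001, 0x0a000002),
                          make_portpair(0x1234, 0x0050)));
        return 0;
}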
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index cc280a3f4f9..ef7ae6049a5 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -244,11 +244,11 @@ static void lro_add_frags(struct net_lro_desc *lro_desc,
244 skb->truesize += truesize; 244 skb->truesize += truesize;
245 245
246 skb_frags[0].page_offset += hlen; 246 skb_frags[0].page_offset += hlen;
247 skb_frag_size_sub(&skb_frags[0], hlen); 247 skb_frags[0].size -= hlen;
248 248
249 while (tcp_data_len > 0) { 249 while (tcp_data_len > 0) {
250 *(lro_desc->next_frag) = *skb_frags; 250 *(lro_desc->next_frag) = *skb_frags;
251 tcp_data_len -= skb_frag_size(skb_frags); 251 tcp_data_len -= skb_frags->size;
252 lro_desc->next_frag++; 252 lro_desc->next_frag++;
253 skb_frags++; 253 skb_frags++;
254 skb_shinfo(skb)->nr_frags++; 254 skb_shinfo(skb)->nr_frags++;
@@ -400,14 +400,14 @@ static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr,
400 skb_frags = skb_shinfo(skb)->frags; 400 skb_frags = skb_shinfo(skb)->frags;
401 while (data_len > 0) { 401 while (data_len > 0) {
402 *skb_frags = *frags; 402 *skb_frags = *frags;
403 data_len -= skb_frag_size(frags); 403 data_len -= frags->size;
404 skb_frags++; 404 skb_frags++;
405 frags++; 405 frags++;
406 skb_shinfo(skb)->nr_frags++; 406 skb_shinfo(skb)->nr_frags++;
407 } 407 }
408 408
409 skb_shinfo(skb)->frags[0].page_offset += hdr_len; 409 skb_shinfo(skb)->frags[0].page_offset += hdr_len;
410 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], hdr_len); 410 skb_shinfo(skb)->frags[0].size -= hdr_len;
411 411
412 skb->ip_summed = ip_summed; 412 skb->ip_summed = ip_summed;
413 skb->csum = sum; 413 skb->csum = sum;
@@ -433,7 +433,7 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
433 if (!lro_mgr->get_frag_header || 433 if (!lro_mgr->get_frag_header ||
434 lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, 434 lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
435 (void *)&tcph, &flags, priv)) { 435 (void *)&tcph, &flags, priv)) {
436 mac_hdr = skb_frag_address(frags); 436 mac_hdr = page_address(frags->page) + frags->page_offset;
437 goto out1; 437 goto out1;
438 } 438 }
439 439
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2784db3155f..3c8dfa16614 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -11,7 +11,6 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/kmemcheck.h> 12#include <linux/kmemcheck.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/module.h>
15#include <net/inet_hashtables.h> 14#include <net/inet_hashtables.h>
16#include <net/inet_timewait_sock.h> 15#include <net/inet_timewait_sock.h>
17#include <net/ip.h> 16#include <net/ip.h>
@@ -89,8 +88,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
89 88
90#ifdef SOCK_REFCNT_DEBUG 89#ifdef SOCK_REFCNT_DEBUG
91 if (atomic_read(&tw->tw_refcnt) != 1) { 90 if (atomic_read(&tw->tw_refcnt) != 1) {
92 pr_debug("%s timewait_sock %p refcnt=%d\n", 91 printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
93 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); 92 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
94 } 93 }
95#endif 94#endif
96 while (refcnt) { 95 while (refcnt) {
@@ -184,7 +183,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
184 tw->tw_daddr = inet->inet_daddr; 183 tw->tw_daddr = inet->inet_daddr;
185 tw->tw_rcv_saddr = inet->inet_rcv_saddr; 184 tw->tw_rcv_saddr = inet->inet_rcv_saddr;
186 tw->tw_bound_dev_if = sk->sk_bound_dev_if; 185 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
187 tw->tw_tos = inet->tos;
188 tw->tw_num = inet->inet_num; 186 tw->tw_num = inet->inet_num;
189 tw->tw_state = TCP_TIME_WAIT; 187 tw->tw_state = TCP_TIME_WAIT;
190 tw->tw_substate = state; 188 tw->tw_substate = state;
@@ -263,7 +261,7 @@ rescan:
263void inet_twdr_hangman(unsigned long data) 261void inet_twdr_hangman(unsigned long data)
264{ 262{
265 struct inet_timewait_death_row *twdr; 263 struct inet_timewait_death_row *twdr;
266 unsigned int need_timer; 264 int unsigned need_timer;
267 265
268 twdr = (struct inet_timewait_death_row *)data; 266 twdr = (struct inet_timewait_death_row *)data;
269 spin_lock(&twdr->death_lock); 267 spin_lock(&twdr->death_lock);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 000e3d239d6..86f13c67ea8 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -17,7 +17,6 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/net.h> 19#include <linux/net.h>
20#include <linux/workqueue.h>
21#include <net/ip.h> 20#include <net/ip.h>
22#include <net/inetpeer.h> 21#include <net/inetpeer.h>
23#include <net/secure_seq.h> 22#include <net/secure_seq.h>
@@ -67,11 +66,6 @@
67 66
68static struct kmem_cache *peer_cachep __read_mostly; 67static struct kmem_cache *peer_cachep __read_mostly;
69 68
70static LIST_HEAD(gc_list);
71static const int gc_delay = 60 * HZ;
72static struct delayed_work gc_work;
73static DEFINE_SPINLOCK(gc_lock);
74
75#define node_height(x) x->avl_height 69#define node_height(x) x->avl_height
76 70
77#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) 71#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
@@ -82,39 +76,23 @@ static const struct inet_peer peer_fake_node = {
82 .avl_height = 0 76 .avl_height = 0
83}; 77};
84 78
85void inet_peer_base_init(struct inet_peer_base *bp) 79struct inet_peer_base {
86{ 80 struct inet_peer __rcu *root;
87 bp->root = peer_avl_empty_rcu; 81 seqlock_t lock;
88 seqlock_init(&bp->lock); 82 int total;
89 bp->flush_seq = ~0U; 83};
90 bp->total = 0;
91}
92EXPORT_SYMBOL_GPL(inet_peer_base_init);
93
94static atomic_t v4_seq = ATOMIC_INIT(0);
95static atomic_t v6_seq = ATOMIC_INIT(0);
96
97static atomic_t *inetpeer_seq_ptr(int family)
98{
99 return (family == AF_INET ? &v4_seq : &v6_seq);
100}
101
102static inline void flush_check(struct inet_peer_base *base, int family)
103{
104 atomic_t *fp = inetpeer_seq_ptr(family);
105
106 if (unlikely(base->flush_seq != atomic_read(fp))) {
107 inetpeer_invalidate_tree(base);
108 base->flush_seq = atomic_read(fp);
109 }
110}
111 84
112void inetpeer_invalidate_family(int family) 85static struct inet_peer_base v4_peers = {
113{ 86 .root = peer_avl_empty_rcu,
114 atomic_t *fp = inetpeer_seq_ptr(family); 87 .lock = __SEQLOCK_UNLOCKED(v4_peers.lock),
88 .total = 0,
89};
115 90
116 atomic_inc(fp); 91static struct inet_peer_base v6_peers = {
117} 92 .root = peer_avl_empty_rcu,
93 .lock = __SEQLOCK_UNLOCKED(v6_peers.lock),
94 .total = 0,
95};
118 96
119#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ 97#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
120 98
@@ -124,52 +102,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m
124int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ 102int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
125int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ 103int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
126 104
127static void inetpeer_gc_worker(struct work_struct *work)
128{
129 struct inet_peer *p, *n, *c;
130 LIST_HEAD(list);
131
132 spin_lock_bh(&gc_lock);
133 list_replace_init(&gc_list, &list);
134 spin_unlock_bh(&gc_lock);
135
136 if (list_empty(&list))
137 return;
138
139 list_for_each_entry_safe(p, n, &list, gc_list) {
140
141 if (need_resched())
142 cond_resched();
143
144 c = rcu_dereference_protected(p->avl_left, 1);
145 if (c != peer_avl_empty) {
146 list_add_tail(&c->gc_list, &list);
147 p->avl_left = peer_avl_empty_rcu;
148 }
149
150 c = rcu_dereference_protected(p->avl_right, 1);
151 if (c != peer_avl_empty) {
152 list_add_tail(&c->gc_list, &list);
153 p->avl_right = peer_avl_empty_rcu;
154 }
155
156 n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
157
158 if (!atomic_read(&p->refcnt)) {
159 list_del(&p->gc_list);
160 kmem_cache_free(peer_cachep, p);
161 }
162 }
163
164 if (list_empty(&list))
165 return;
166
167 spin_lock_bh(&gc_lock);
168 list_splice(&list, &gc_list);
169 spin_unlock_bh(&gc_lock);
170
171 schedule_delayed_work(&gc_work, gc_delay);
172}
173 105
174/* Called from ip_output.c:ip_init */ 106/* Called from ip_output.c:ip_init */
175void __init inet_initpeers(void) 107void __init inet_initpeers(void)
@@ -194,7 +126,6 @@ void __init inet_initpeers(void)
194 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, 126 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
195 NULL); 127 NULL);
196 128
197 INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
198} 129}
199 130
200static int addr_compare(const struct inetpeer_addr *a, 131static int addr_compare(const struct inetpeer_addr *a,
@@ -205,7 +136,7 @@ static int addr_compare(const struct inetpeer_addr *a,
205 for (i = 0; i < n; i++) { 136 for (i = 0; i < n; i++) {
206 if (a->addr.a6[i] == b->addr.a6[i]) 137 if (a->addr.a6[i] == b->addr.a6[i])
207 continue; 138 continue;
208 if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) 139 if (a->addr.a6[i] < b->addr.a6[i])
209 return -1; 140 return -1;
210 return 1; 141 return 1;
211 } 142 }
@@ -419,6 +350,11 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
419 call_rcu(&p->rcu, inetpeer_free_rcu); 350 call_rcu(&p->rcu, inetpeer_free_rcu);
420} 351}
421 352
353static struct inet_peer_base *family_to_base(int family)
354{
355 return family == AF_INET ? &v4_peers : &v6_peers;
356}
357
422/* perform garbage collect on all items stacked during a lookup */ 358/* perform garbage collect on all items stacked during a lookup */
423static int inet_peer_gc(struct inet_peer_base *base, 359static int inet_peer_gc(struct inet_peer_base *base,
424 struct inet_peer __rcu **stack[PEER_MAXDEPTH], 360 struct inet_peer __rcu **stack[PEER_MAXDEPTH],
@@ -456,17 +392,14 @@ static int inet_peer_gc(struct inet_peer_base *base,
456 return cnt; 392 return cnt;
457} 393}
458 394
459struct inet_peer *inet_getpeer(struct inet_peer_base *base, 395struct inet_peer *inet_getpeer(const struct inetpeer_addr *daddr, int create)
460 const struct inetpeer_addr *daddr,
461 int create)
462{ 396{
463 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; 397 struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
398 struct inet_peer_base *base = family_to_base(daddr->family);
464 struct inet_peer *p; 399 struct inet_peer *p;
465 unsigned int sequence; 400 unsigned int sequence;
466 int invalidated, gccnt = 0; 401 int invalidated, gccnt = 0;
467 402
468 flush_check(base, daddr->family);
469
470 /* Attempt a lockless lookup first. 403 /* Attempt a lockless lookup first.
471 * Because of a concurrent writer, we might not find an existing entry. 404 * Because of a concurrent writer, we might not find an existing entry.
472 */ 405 */
@@ -508,13 +441,14 @@ relookup:
508 (daddr->family == AF_INET) ? 441 (daddr->family == AF_INET) ?
509 secure_ip_id(daddr->addr.a4) : 442 secure_ip_id(daddr->addr.a4) :
510 secure_ipv6_id(daddr->addr.a6)); 443 secure_ipv6_id(daddr->addr.a6));
444 p->tcp_ts_stamp = 0;
511 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; 445 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
512 p->rate_tokens = 0; 446 p->rate_tokens = 0;
513 /* 60*HZ is arbitrary, but chosen enough high so that the first 447 p->rate_last = 0;
514 * calculation of tokens is at its maximum. 448 p->pmtu_expires = 0;
515 */ 449 p->pmtu_orig = 0;
516 p->rate_last = jiffies - 60*HZ; 450 memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
517 INIT_LIST_HEAD(&p->gc_list); 451
518 452
519 /* Link the node. */ 453 /* Link the node. */
520 link_to_pool(p, base); 454 link_to_pool(p, base);
@@ -574,31 +508,3 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
574 return rc; 508 return rc;
575} 509}
576EXPORT_SYMBOL(inet_peer_xrlim_allow); 510EXPORT_SYMBOL(inet_peer_xrlim_allow);
577
578static void inetpeer_inval_rcu(struct rcu_head *head)
579{
580 struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);
581
582 spin_lock_bh(&gc_lock);
583 list_add_tail(&p->gc_list, &gc_list);
584 spin_unlock_bh(&gc_lock);
585
586 schedule_delayed_work(&gc_work, gc_delay);
587}
588
589void inetpeer_invalidate_tree(struct inet_peer_base *base)
590{
591 struct inet_peer *root;
592
593 write_seqlock_bh(&base->lock);
594
595 root = rcu_deref_locked(base->root, base);
596 if (root != peer_avl_empty) {
597 base->root = peer_avl_empty_rcu;
598 base->total = 0;
599 call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
600 }
601
602 write_sequnlock_bh(&base->lock);
603}
604EXPORT_SYMBOL(inetpeer_invalidate_tree);
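In the inetpeer.c hunk for addr_compare(), both versions walk the key as an array of 32-bit words, one word for IPv4 and four for IPv6, and differ only in the (__force u32) casts added to keep sparse quiet about ordering __be32 values. The ordering merely has to be a consistent total order for the AVL tree, not numerically meaningful. A stand-alone rendition of the loop, assuming plain uint32_t words, is shown below as an illustration (the kernel version operates on struct inetpeer_addr):

#include <stdio.h>
#include <stdint.h>

/* Compare two address keys word by word, returning -1/0/1 like memcmp. */
static int addr_compare(const uint32_t *a, const uint32_t *b, int family_is_v4)
{
        int i, n = family_is_v4 ? 1 : 4;

        for (i = 0; i < n; i++) {
                if (a[i] == b[i])
                        continue;
                return a[i] < b[i] ? -1 : 1;
        }
        return 0;
}

int main(void)
{
        uint32_t a[4] = { 0x20010db8, 0, 0, 1 };
        uint32_t b[4] = { 0x20010db8, 0, 0, 2 };

        printf("%d\n", addr_compare(a, b, 0));   /* prints -1 */
        return 0;
}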
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 694de3b7aeb..3b34d1c8627 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -41,10 +41,9 @@
41 41
42static int ip_forward_finish(struct sk_buff *skb) 42static int ip_forward_finish(struct sk_buff *skb)
43{ 43{
44 struct ip_options *opt = &(IPCB(skb)->opt); 44 struct ip_options * opt = &(IPCB(skb)->opt);
45 45
46 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 46 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
47 IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
48 47
49 if (unlikely(opt->optlen)) 48 if (unlikely(opt->optlen))
50 ip_forward_options(skb); 49 ip_forward_options(skb);
@@ -56,7 +55,7 @@ int ip_forward(struct sk_buff *skb)
56{ 55{
57 struct iphdr *iph; /* Our header */ 56 struct iphdr *iph; /* Our header */
58 struct rtable *rt; /* Route we use */ 57 struct rtable *rt; /* Route we use */
59 struct ip_options *opt = &(IPCB(skb)->opt); 58 struct ip_options * opt = &(IPCB(skb)->opt);
60 59
61 if (skb_warn_if_lro(skb)) 60 if (skb_warn_if_lro(skb))
62 goto drop; 61 goto drop;
@@ -85,7 +84,7 @@ int ip_forward(struct sk_buff *skb)
85 84
86 rt = skb_rtable(skb); 85 rt = skb_rtable(skb);
87 86
88 if (opt->is_strictroute && rt->rt_uses_gateway) 87 if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway)
89 goto sr_failed; 88 goto sr_failed;
90 89
91 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && 90 if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index eb9d63a570c..0e0ab98abc6 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -20,8 +20,6 @@
20 * Patrick McHardy : LRU queue of frag heads for evictor. 20 * Patrick McHardy : LRU queue of frag heads for evictor.
21 */ 21 */
22 22
23#define pr_fmt(fmt) "IPv4: " fmt
24
25#include <linux/compiler.h> 23#include <linux/compiler.h>
26#include <linux/module.h> 24#include <linux/module.h>
27#include <linux/types.h> 25#include <linux/types.h>
@@ -148,17 +146,17 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)
148 return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); 146 return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
149} 147}
150 148
151static bool ip4_frag_match(struct inet_frag_queue *q, void *a) 149static int ip4_frag_match(struct inet_frag_queue *q, void *a)
152{ 150{
153 struct ipq *qp; 151 struct ipq *qp;
154 struct ip4_create_arg *arg = a; 152 struct ip4_create_arg *arg = a;
155 153
156 qp = container_of(q, struct ipq, q); 154 qp = container_of(q, struct ipq, q);
157 return qp->id == arg->iph->id && 155 return qp->id == arg->iph->id &&
158 qp->saddr == arg->iph->saddr && 156 qp->saddr == arg->iph->saddr &&
159 qp->daddr == arg->iph->daddr && 157 qp->daddr == arg->iph->daddr &&
160 qp->protocol == arg->iph->protocol && 158 qp->protocol == arg->iph->protocol &&
161 qp->user == arg->user; 159 qp->user == arg->user;
162} 160}
163 161
164/* Memory Tracking Functions. */ 162/* Memory Tracking Functions. */
@@ -171,10 +169,6 @@ static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
171static void ip4_frag_init(struct inet_frag_queue *q, void *a) 169static void ip4_frag_init(struct inet_frag_queue *q, void *a)
172{ 170{
173 struct ipq *qp = container_of(q, struct ipq, q); 171 struct ipq *qp = container_of(q, struct ipq, q);
174 struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
175 frags);
176 struct net *net = container_of(ipv4, struct net, ipv4);
177
178 struct ip4_create_arg *arg = a; 172 struct ip4_create_arg *arg = a;
179 173
180 qp->protocol = arg->iph->protocol; 174 qp->protocol = arg->iph->protocol;
@@ -184,7 +178,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
184 qp->daddr = arg->iph->daddr; 178 qp->daddr = arg->iph->daddr;
185 qp->user = arg->user; 179 qp->user = arg->user;
186 qp->peer = sysctl_ipfrag_max_dist ? 180 qp->peer = sysctl_ipfrag_max_dist ?
187 inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; 181 inet_getpeer_v4(arg->iph->saddr, 1) : NULL;
188} 182}
189 183
190static __inline__ void ip4_frag_free(struct inet_frag_queue *q) 184static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
@@ -219,7 +213,7 @@ static void ip_evictor(struct net *net)
219{ 213{
220 int evicted; 214 int evicted;
221 215
222 evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false); 216 evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags);
223 if (evicted) 217 if (evicted)
224 IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); 218 IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
225} 219}
@@ -305,7 +299,7 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
305 return container_of(q, struct ipq, q); 299 return container_of(q, struct ipq, q);
306 300
307out_nomem: 301out_nomem:
308 LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n")); 302 LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n");
309 return NULL; 303 return NULL;
310} 304}
311 305
@@ -398,7 +392,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
398 /* Is this the final fragment? */ 392 /* Is this the final fragment? */
399 if ((flags & IP_MF) == 0) { 393 if ((flags & IP_MF) == 0) {
400 /* If we already have some bits beyond end 394 /* If we already have some bits beyond end
401 * or have different end, the segment is corrupted. 395 * or have different end, the segment is corrupted.
402 */ 396 */
403 if (end < qp->q.len || 397 if (end < qp->q.len ||
404 ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) 398 ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
@@ -523,10 +517,6 @@ found:
523 if (offset == 0) 517 if (offset == 0)
524 qp->q.last_in |= INET_FRAG_FIRST_IN; 518 qp->q.last_in |= INET_FRAG_FIRST_IN;
525 519
526 if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
527 skb->len + ihl > qp->q.max_size)
528 qp->q.max_size = skb->len + ihl;
529
530 if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && 520 if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
531 qp->q.meat == qp->q.len) 521 qp->q.meat == qp->q.len)
532 return ip_frag_reasm(qp, prev, dev); 522 return ip_frag_reasm(qp, prev, dev);
@@ -553,7 +543,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
553 int len; 543 int len;
554 int ihlen; 544 int ihlen;
555 int err; 545 int err;
556 int sum_truesize;
557 u8 ecn; 546 u8 ecn;
558 547
559 ipq_kill(qp); 548 ipq_kill(qp);
@@ -578,7 +567,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
578 skb_morph(head, qp->q.fragments); 567 skb_morph(head, qp->q.fragments);
579 head->next = qp->q.fragments->next; 568 head->next = qp->q.fragments->next;
580 569
581 consume_skb(qp->q.fragments); 570 kfree_skb(qp->q.fragments);
582 qp->q.fragments = head; 571 qp->q.fragments = head;
583 } 572 }
584 573
@@ -610,8 +599,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
610 head->next = clone; 599 head->next = clone;
611 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; 600 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
612 skb_frag_list_init(head); 601 skb_frag_list_init(head);
613 for (i = 0; i < skb_shinfo(head)->nr_frags; i++) 602 for (i=0; i<skb_shinfo(head)->nr_frags; i++)
614 plen += skb_frag_size(&skb_shinfo(head)->frags[i]); 603 plen += skb_shinfo(head)->frags[i].size;
615 clone->len = clone->data_len = head->data_len - plen; 604 clone->len = clone->data_len = head->data_len - plen;
616 head->data_len -= clone->len; 605 head->data_len -= clone->len;
617 head->len -= clone->len; 606 head->len -= clone->len;
@@ -620,41 +609,26 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
620 atomic_add(clone->truesize, &qp->q.net->mem); 609 atomic_add(clone->truesize, &qp->q.net->mem);
621 } 610 }
622 611
612 skb_shinfo(head)->frag_list = head->next;
623 skb_push(head, head->data - skb_network_header(head)); 613 skb_push(head, head->data - skb_network_header(head));
624 614
625 sum_truesize = head->truesize; 615 for (fp=head->next; fp; fp = fp->next) {
626 for (fp = head->next; fp;) { 616 head->data_len += fp->len;
627 bool headstolen; 617 head->len += fp->len;
628 int delta;
629 struct sk_buff *next = fp->next;
630
631 sum_truesize += fp->truesize;
632 if (head->ip_summed != fp->ip_summed) 618 if (head->ip_summed != fp->ip_summed)
633 head->ip_summed = CHECKSUM_NONE; 619 head->ip_summed = CHECKSUM_NONE;
634 else if (head->ip_summed == CHECKSUM_COMPLETE) 620 else if (head->ip_summed == CHECKSUM_COMPLETE)
635 head->csum = csum_add(head->csum, fp->csum); 621 head->csum = csum_add(head->csum, fp->csum);
636 622 head->truesize += fp->truesize;
637 if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
638 kfree_skb_partial(fp, headstolen);
639 } else {
640 if (!skb_shinfo(head)->frag_list)
641 skb_shinfo(head)->frag_list = fp;
642 head->data_len += fp->len;
643 head->len += fp->len;
644 head->truesize += fp->truesize;
645 }
646 fp = next;
647 } 623 }
648 atomic_sub(sum_truesize, &qp->q.net->mem); 624 atomic_sub(head->truesize, &qp->q.net->mem);
649 625
650 head->next = NULL; 626 head->next = NULL;
651 head->dev = dev; 627 head->dev = dev;
652 head->tstamp = qp->q.stamp; 628 head->tstamp = qp->q.stamp;
653 IPCB(head)->frag_max_size = qp->q.max_size;
654 629
655 iph = ip_hdr(head); 630 iph = ip_hdr(head);
656 /* max_size != 0 implies at least one fragment had IP_DF set */ 631 iph->frag_off = 0;
657 iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
658 iph->tot_len = htons(len); 632 iph->tot_len = htons(len);
659 iph->tos |= ecn; 633 iph->tos |= ecn;
660 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); 634 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
@@ -663,12 +637,14 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
663 return 0; 637 return 0;
664 638
665out_nomem: 639out_nomem:
666 LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"), 640 LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing "
667 qp); 641 "queue %p\n", qp);
668 err = -ENOMEM; 642 err = -ENOMEM;
669 goto out_fail; 643 goto out_fail;
670out_oversize: 644out_oversize:
671 net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); 645 if (net_ratelimit())
646 printk(KERN_INFO "Oversized IP packet from %pI4.\n",
647 &qp->saddr);
672out_fail: 648out_fail:
673 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); 649 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
674 return err; 650 return err;
@@ -684,7 +660,8 @@ int ip_defrag(struct sk_buff *skb, u32 user)
684 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); 660 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
685 661
686 /* Start by cleaning up the memory. */ 662 /* Start by cleaning up the memory. */
687 ip_evictor(net); 663 if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
664 ip_evictor(net);
688 665
689 /* Lookup (or create) queue header */ 666 /* Lookup (or create) queue header */
690 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { 667 if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
@@ -705,41 +682,6 @@ int ip_defrag(struct sk_buff *skb, u32 user)
705} 682}
706EXPORT_SYMBOL(ip_defrag); 683EXPORT_SYMBOL(ip_defrag);
707 684
708struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
709{
710 struct iphdr iph;
711 u32 len;
712
713 if (skb->protocol != htons(ETH_P_IP))
714 return skb;
715
716 if (!skb_copy_bits(skb, 0, &iph, sizeof(iph)))
717 return skb;
718
719 if (iph.ihl < 5 || iph.version != 4)
720 return skb;
721
722 len = ntohs(iph.tot_len);
723 if (skb->len < len || len < (iph.ihl * 4))
724 return skb;
725
726 if (ip_is_fragment(&iph)) {
727 skb = skb_share_check(skb, GFP_ATOMIC);
728 if (skb) {
729 if (!pskb_may_pull(skb, iph.ihl*4))
730 return skb;
731 if (pskb_trim_rcsum(skb, len))
732 return skb;
733 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
734 if (ip_defrag(skb, user))
735 return NULL;
736 skb->rxhash = 0;
737 }
738 }
739 return skb;
740}
741EXPORT_SYMBOL(ip_check_defrag);
742
743#ifdef CONFIG_SYSCTL 685#ifdef CONFIG_SYSCTL
744static int zero; 686static int zero;
745 687
@@ -801,13 +743,9 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
801 table[0].data = &net->ipv4.frags.high_thresh; 743 table[0].data = &net->ipv4.frags.high_thresh;
802 table[1].data = &net->ipv4.frags.low_thresh; 744 table[1].data = &net->ipv4.frags.low_thresh;
803 table[2].data = &net->ipv4.frags.timeout; 745 table[2].data = &net->ipv4.frags.timeout;
804
805 /* Don't export sysctls to unprivileged users */
806 if (net->user_ns != &init_user_ns)
807 table[0].procname = NULL;
808 } 746 }
809 747
810 hdr = register_net_sysctl(net, "net/ipv4", table); 748 hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table);
811 if (hdr == NULL) 749 if (hdr == NULL)
812 goto err_reg; 750 goto err_reg;
813 751
@@ -832,7 +770,7 @@ static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
832 770
833static void ip4_frags_ctl_register(void) 771static void ip4_frags_ctl_register(void)
834{ 772{
835 register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); 773 register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table);
836} 774}
837#else 775#else
838static inline int ip4_frags_ns_ctl_register(struct net *net) 776static inline int ip4_frags_ns_ctl_register(struct net *net)
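Several of the ip_fragment.c hunks turn on the frag_off fields of the IPv4 header: the final-fragment test is (flags & IP_MF) == 0, and the newer code also propagates IP_DF from the largest fragment via qp->q.max_size. As a reminder of how those bits and the 13-bit offset (in 8-byte units) are laid out, here is a small user-space decoder; decode_frag_off() is a local helper for this sketch and takes frag_off already converted to host byte order:

#include <stdio.h>
#include <stdint.h>

#define IP_DF     0x4000        /* don't fragment */
#define IP_MF     0x2000        /* more fragments follow */
#define IP_OFFSET 0x1FFF        /* fragment offset, in 8-byte units */

static void decode_frag_off(uint16_t frag_off_host)
{
        unsigned int offset = (frag_off_host & IP_OFFSET) * 8;  /* bytes */
        int more = !!(frag_off_host & IP_MF);
        int dont_frag = !!(frag_off_host & IP_DF);

        printf("offset=%u bytes, MF=%d, DF=%d, last_fragment=%d\n",
               offset, more, dont_frag, !more);
}

int main(void)
{
        decode_frag_off(0x2000);        /* first fragment, MF set        */
        decode_frag_off(0x00b9);        /* offset 185*8 = 1480, last one */
        return 0;
}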
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 303012adf9e..d7bb94c4834 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -10,8 +10,6 @@
10 * 10 *
11 */ 11 */
12 12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15#include <linux/capability.h> 13#include <linux/capability.h>
16#include <linux/module.h> 14#include <linux/module.h>
17#include <linux/types.h> 15#include <linux/types.h>
@@ -48,7 +46,7 @@
48#include <net/rtnetlink.h> 46#include <net/rtnetlink.h>
49#include <net/gre.h> 47#include <net/gre.h>
50 48
51#if IS_ENABLED(CONFIG_IPV6) 49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
52#include <net/ipv6.h> 50#include <net/ipv6.h>
53#include <net/ip6_fib.h> 51#include <net/ip6_fib.h>
54#include <net/ip6_route.h> 52#include <net/ip6_route.h>
@@ -67,7 +65,7 @@
67 it is infeasible task. The most general solutions would be 65 it is infeasible task. The most general solutions would be
68 to keep skb->encapsulation counter (sort of local ttl), 66 to keep skb->encapsulation counter (sort of local ttl),
69 and silently drop packet when it expires. It is a good 67 and silently drop packet when it expires. It is a good
70 solution, but it supposes maintaining new variable in ALL 68 solution, but it supposes maintaining new variable in ALL
71 skb, even if no tunneling is used. 69 skb, even if no tunneling is used.
72 70
73 Current solution: xmit_recursion breaks dead loops. This is a percpu 71 Current solution: xmit_recursion breaks dead loops. This is a percpu
@@ -93,14 +91,14 @@
93 91
94 One of them is to parse packet trying to detect inner encapsulation 92 One of them is to parse packet trying to detect inner encapsulation
95 made by our node. It is difficult or even impossible, especially, 93 made by our node. It is difficult or even impossible, especially,
96 taking into account fragmentation. TO be short, ttl is not solution at all. 94 taking into account fragmentation. TO be short, ttl is not solution at all.
97 95
98 Current solution: The solution was UNEXPECTEDLY SIMPLE. 96 Current solution: The solution was UNEXPECTEDLY SIMPLE.
99 We force DF flag on tunnels with preconfigured hop limit, 97 We force DF flag on tunnels with preconfigured hop limit,
100 that is ALL. :-) Well, it does not remove the problem completely, 98 that is ALL. :-) Well, it does not remove the problem completely,
101 but exponential growth of network traffic is changed to linear 99 but exponential growth of network traffic is changed to linear
102 (branches, that exceed pmtu are pruned) and tunnel mtu 100 (branches, that exceed pmtu are pruned) and tunnel mtu
103 rapidly degrades to value <68, where looping stops. 101 rapidly degrades to value <68, where looping stops.
104 Yes, it is not good if there exists a router in the loop, 102 Yes, it is not good if there exists a router in the loop,
105 which does not force DF, even when encapsulating packets have DF set. 103 which does not force DF, even when encapsulating packets have DF set.
106 But it is not our problem! Nobody could accuse us, we made 104 But it is not our problem! Nobody could accuse us, we made
@@ -120,10 +118,6 @@
120 Alexey Kuznetsov. 118 Alexey Kuznetsov.
121 */ 119 */
122 120
123static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
127static struct rtnl_link_ops ipgre_link_ops __read_mostly; 121static struct rtnl_link_ops ipgre_link_ops __read_mostly;
128static int ipgre_tunnel_init(struct net_device *dev); 122static int ipgre_tunnel_init(struct net_device *dev);
129static void ipgre_tunnel_setup(struct net_device *dev); 123static void ipgre_tunnel_setup(struct net_device *dev);
@@ -164,66 +158,46 @@ struct ipgre_net {
164#define tunnels_r tunnels[2] 158#define tunnels_r tunnels[2]
165#define tunnels_l tunnels[1] 159#define tunnels_l tunnels[1]
166#define tunnels_wc tunnels[0] 160#define tunnels_wc tunnels[0]
161/*
162 * Locking : hash tables are protected by RCU and RTNL
163 */
164
165#define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 167
168static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev, 168/* often modified stats are per cpu, other are shared (netdev->stats) */
169 struct rtnl_link_stats64 *tot) 169struct pcpu_tstats {
170 unsigned long rx_packets;
171 unsigned long rx_bytes;
172 unsigned long tx_packets;
173 unsigned long tx_bytes;
174};
175
176static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
170{ 177{
178 struct pcpu_tstats sum = { 0 };
171 int i; 179 int i;
172 180
173 for_each_possible_cpu(i) { 181 for_each_possible_cpu(i) {
174 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); 182 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
175 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
176 unsigned int start;
177
178 do {
179 start = u64_stats_fetch_begin_bh(&tstats->syncp);
180 rx_packets = tstats->rx_packets;
181 tx_packets = tstats->tx_packets;
182 rx_bytes = tstats->rx_bytes;
183 tx_bytes = tstats->tx_bytes;
184 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
185
186 tot->rx_packets += rx_packets;
187 tot->tx_packets += tx_packets;
188 tot->rx_bytes += rx_bytes;
189 tot->tx_bytes += tx_bytes;
190 }
191
192 tot->multicast = dev->stats.multicast;
193 tot->rx_crc_errors = dev->stats.rx_crc_errors;
194 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
195 tot->rx_length_errors = dev->stats.rx_length_errors;
196 tot->rx_frame_errors = dev->stats.rx_frame_errors;
197 tot->rx_errors = dev->stats.rx_errors;
198
199 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
200 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
201 tot->tx_dropped = dev->stats.tx_dropped;
202 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
203 tot->tx_errors = dev->stats.tx_errors;
204 183
205 return tot; 184 sum.rx_packets += tstats->rx_packets;
206} 185 sum.rx_bytes += tstats->rx_bytes;
207 186 sum.tx_packets += tstats->tx_packets;
208/* Does key in tunnel parameters match packet */ 187 sum.tx_bytes += tstats->tx_bytes;
209static bool ipgre_key_match(const struct ip_tunnel_parm *p, 188 }
210 __be16 flags, __be32 key) 189 dev->stats.rx_packets = sum.rx_packets;
211{ 190 dev->stats.rx_bytes = sum.rx_bytes;
212 if (p->i_flags & GRE_KEY) { 191 dev->stats.tx_packets = sum.tx_packets;
213 if (flags & GRE_KEY) 192 dev->stats.tx_bytes = sum.tx_bytes;
214 return key == p->i_key; 193 return &dev->stats;
215 else
216 return false; /* key expected, none present */
217 } else
218 return !(flags & GRE_KEY);
219} 194}
220 195
221/* Given src, dst and key, find appropriate for input tunnel. */ 196/* Given src, dst and key, find appropriate for input tunnel. */
222 197
223static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev, 198static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
224 __be32 remote, __be32 local, 199 __be32 remote, __be32 local,
225 __be16 flags, __be32 key, 200 __be32 key, __be16 gre_proto)
226 __be16 gre_proto)
227{ 201{
228 struct net *net = dev_net(dev); 202 struct net *net = dev_net(dev);
229 int link = dev->ifindex; 203 int link = dev->ifindex;
@@ -235,15 +209,13 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
235 ARPHRD_ETHER : ARPHRD_IPGRE; 209 ARPHRD_ETHER : ARPHRD_IPGRE;
236 int score, cand_score = 4; 210 int score, cand_score = 4;
237 211
238 for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { 212 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
239 if (local != t->parms.iph.saddr || 213 if (local != t->parms.iph.saddr ||
240 remote != t->parms.iph.daddr || 214 remote != t->parms.iph.daddr ||
215 key != t->parms.i_key ||
241 !(t->dev->flags & IFF_UP)) 216 !(t->dev->flags & IFF_UP))
242 continue; 217 continue;
243 218
244 if (!ipgre_key_match(&t->parms, flags, key))
245 continue;
246
247 if (t->dev->type != ARPHRD_IPGRE && 219 if (t->dev->type != ARPHRD_IPGRE &&
248 t->dev->type != dev_type) 220 t->dev->type != dev_type)
249 continue; 221 continue;
@@ -262,14 +234,12 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
262 } 234 }
263 } 235 }
264 236
265 for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { 237 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
266 if (remote != t->parms.iph.daddr || 238 if (remote != t->parms.iph.daddr ||
239 key != t->parms.i_key ||
267 !(t->dev->flags & IFF_UP)) 240 !(t->dev->flags & IFF_UP))
268 continue; 241 continue;
269 242
270 if (!ipgre_key_match(&t->parms, flags, key))
271 continue;
272
273 if (t->dev->type != ARPHRD_IPGRE && 243 if (t->dev->type != ARPHRD_IPGRE &&
274 t->dev->type != dev_type) 244 t->dev->type != dev_type)
275 continue; 245 continue;
@@ -288,16 +258,14 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
288 } 258 }
289 } 259 }
290 260
291 for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { 261 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
292 if ((local != t->parms.iph.saddr && 262 if ((local != t->parms.iph.saddr &&
293 (local != t->parms.iph.daddr || 263 (local != t->parms.iph.daddr ||
294 !ipv4_is_multicast(local))) || 264 !ipv4_is_multicast(local))) ||
265 key != t->parms.i_key ||
295 !(t->dev->flags & IFF_UP)) 266 !(t->dev->flags & IFF_UP))
296 continue; 267 continue;
297 268
298 if (!ipgre_key_match(&t->parms, flags, key))
299 continue;
300
301 if (t->dev->type != ARPHRD_IPGRE && 269 if (t->dev->type != ARPHRD_IPGRE &&
302 t->dev->type != dev_type) 270 t->dev->type != dev_type)
303 continue; 271 continue;
@@ -316,7 +284,7 @@ static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
316 } 284 }
317 } 285 }
318 286
319 for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { 287 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
320 if (t->parms.i_key != key || 288 if (t->parms.i_key != key ||
321 !(t->dev->flags & IFF_UP)) 289 !(t->dev->flags & IFF_UP))
322 continue; 290 continue;
@@ -454,10 +422,6 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
454 if (register_netdevice(dev) < 0) 422 if (register_netdevice(dev) < 0)
455 goto failed_free; 423 goto failed_free;
456 424
457 /* Can use a lockless transmit, unless we generate output sequences */
458 if (!(nt->parms.o_flags & GRE_SEQ))
459 dev->features |= NETIF_F_LLTX;
460
461 dev_hold(dev); 425 dev_hold(dev);
462 ipgre_tunnel_link(ign, nt); 426 ipgre_tunnel_link(ign, nt);
463 return nt; 427 return nt;
@@ -489,18 +453,17 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
489 GRE tunnels with enabled checksum. Tell them "thank you". 453 GRE tunnels with enabled checksum. Tell them "thank you".
490 454
491 Well, I wonder, rfc1812 was written by Cisco employee, 455 Well, I wonder, rfc1812 was written by Cisco employee,
492 what the hell these idiots break standards established 456 what the hell these idiots break standards established
493 by themselves??? 457 by themselves???
494 */ 458 */
495 459
496 const struct iphdr *iph = (const struct iphdr *)skb->data; 460 const struct iphdr *iph = (const struct iphdr *)skb->data;
497 __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2)); 461 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
498 int grehlen = (iph->ihl<<2) + 4; 462 int grehlen = (iph->ihl<<2) + 4;
499 const int type = icmp_hdr(skb)->type; 463 const int type = icmp_hdr(skb)->type;
500 const int code = icmp_hdr(skb)->code; 464 const int code = icmp_hdr(skb)->code;
501 struct ip_tunnel *t; 465 struct ip_tunnel *t;
502 __be16 flags; 466 __be16 flags;
503 __be32 key = 0;
504 467
505 flags = p[0]; 468 flags = p[0];
506 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { 469 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
@@ -517,9 +480,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
517 if (skb_headlen(skb) < grehlen) 480 if (skb_headlen(skb) < grehlen)
518 return; 481 return;
519 482
520 if (flags & GRE_KEY)
521 key = *(((__be32 *)p) + (grehlen / 4) - 1);
522
523 switch (type) { 483 switch (type) {
524 default: 484 default:
525 case ICMP_PARAMETERPROB: 485 case ICMP_PARAMETERPROB:
@@ -531,6 +491,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
531 case ICMP_PORT_UNREACH: 491 case ICMP_PORT_UNREACH:
532 /* Impossible event. */ 492 /* Impossible event. */
533 return; 493 return;
494 case ICMP_FRAG_NEEDED:
495 /* Soft state for pmtu is maintained by IP core. */
496 return;
534 default: 497 default:
535 /* All others are translated to HOST_UNREACH. 498 /* All others are translated to HOST_UNREACH.
536 rfc2003 contains "deep thoughts" about NET_UNREACH, 499 rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -543,39 +506,38 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
543 if (code != ICMP_EXC_TTL) 506 if (code != ICMP_EXC_TTL)
544 return; 507 return;
545 break; 508 break;
546
547 case ICMP_REDIRECT:
548 break;
549 } 509 }
550 510
511 rcu_read_lock();
551 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, 512 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
552 flags, key, p[1]); 513 flags & GRE_KEY ?
553 514 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
554 if (t == NULL) 515 p[1]);
555 return; 516 if (t == NULL || t->parms.iph.daddr == 0 ||
556
557 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
558 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
559 t->parms.link, 0, IPPROTO_GRE, 0);
560 return;
561 }
562 if (type == ICMP_REDIRECT) {
563 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
564 IPPROTO_GRE, 0);
565 return;
566 }
567 if (t->parms.iph.daddr == 0 ||
568 ipv4_is_multicast(t->parms.iph.daddr)) 517 ipv4_is_multicast(t->parms.iph.daddr))
569 return; 518 goto out;
570 519
571 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 520 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
572 return; 521 goto out;
573 522
574 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) 523 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
575 t->err_count++; 524 t->err_count++;
576 else 525 else
577 t->err_count = 1; 526 t->err_count = 1;
578 t->err_time = jiffies; 527 t->err_time = jiffies;
528out:
529 rcu_read_unlock();
530}
531
532static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
533{
534 if (INET_ECN_is_ce(iph->tos)) {
535 if (skb->protocol == htons(ETH_P_IP)) {
536 IP_ECN_set_ce(ip_hdr(skb));
537 } else if (skb->protocol == htons(ETH_P_IPV6)) {
538 IP6_ECN_set_ce(ipv6_hdr(skb));
539 }
540 }
579} 541}
580 542
581static inline u8 543static inline u8
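The hunk above restores the small ipgre_ecn_decapsulate() helper: it copies a Congestion Experienced mark from the outer tunnel header onto the inner packet. The ECN field is the low two bits of the TOS byte (RFC 3168) and CE is the value 3. A minimal userspace sketch of the same check, with no kernel types; the kernel helpers additionally skip not-ECT inner packets and adjust the IPv4 header checksum:

#include <stdint.h>
#include <stdio.h>

#define ECN_MASK 0x03   /* low two bits of the TOS / traffic-class byte */
#define ECN_CE   0x03   /* Congestion Experienced */

/* Roughly what INET_ECN_is_ce() tests on the outer header. */
static int outer_is_ce(uint8_t outer_tos)
{
        return (outer_tos & ECN_MASK) == ECN_CE;
}

/* Roughly what IP_ECN_set_ce() does to the inner TOS byte. */
static uint8_t inner_set_ce(uint8_t inner_tos)
{
        return inner_tos | ECN_CE;
}

int main(void)
{
        uint8_t outer = 0x03, inner = 0x02;   /* inner is ECT(0) */
        if (outer_is_ce(outer))
                inner = inner_set_ce(inner);
        printf("inner tos now %#x\n", inner);
        return 0;
}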
@@ -600,21 +562,20 @@ static int ipgre_rcv(struct sk_buff *skb)
600 struct ip_tunnel *tunnel; 562 struct ip_tunnel *tunnel;
601 int offset = 4; 563 int offset = 4;
602 __be16 gre_proto; 564 __be16 gre_proto;
603 int err;
604 565
605 if (!pskb_may_pull(skb, 16)) 566 if (!pskb_may_pull(skb, 16))
606 goto drop; 567 goto drop_nolock;
607 568
608 iph = ip_hdr(skb); 569 iph = ip_hdr(skb);
609 h = skb->data; 570 h = skb->data;
610 flags = *(__be16 *)h; 571 flags = *(__be16*)h;
611 572
612 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { 573 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
613 /* - Version must be 0. 574 /* - Version must be 0.
614 - We do not support routing headers. 575 - We do not support routing headers.
615 */ 576 */
616 if (flags&(GRE_VERSION|GRE_ROUTING)) 577 if (flags&(GRE_VERSION|GRE_ROUTING))
617 goto drop; 578 goto drop_nolock;
618 579
619 if (flags&GRE_CSUM) { 580 if (flags&GRE_CSUM) {
620 switch (skb->ip_summed) { 581 switch (skb->ip_summed) {
@@ -631,21 +592,21 @@ static int ipgre_rcv(struct sk_buff *skb)
631 offset += 4; 592 offset += 4;
632 } 593 }
633 if (flags&GRE_KEY) { 594 if (flags&GRE_KEY) {
634 key = *(__be32 *)(h + offset); 595 key = *(__be32*)(h + offset);
635 offset += 4; 596 offset += 4;
636 } 597 }
637 if (flags&GRE_SEQ) { 598 if (flags&GRE_SEQ) {
638 seqno = ntohl(*(__be32 *)(h + offset)); 599 seqno = ntohl(*(__be32*)(h + offset));
639 offset += 4; 600 offset += 4;
640 } 601 }
641 } 602 }
642 603
643 gre_proto = *(__be16 *)(h + 2); 604 gre_proto = *(__be16 *)(h + 2);
644 605
645 tunnel = ipgre_tunnel_lookup(skb->dev, 606 rcu_read_lock();
646 iph->saddr, iph->daddr, flags, key, 607 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
647 gre_proto); 608 iph->saddr, iph->daddr, key,
648 if (tunnel) { 609 gre_proto))) {
649 struct pcpu_tstats *tstats; 610 struct pcpu_tstats *tstats;
650 611
651 secpath_reset(skb); 612 secpath_reset(skb);
@@ -704,33 +665,25 @@ static int ipgre_rcv(struct sk_buff *skb)
704 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 665 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
705 } 666 }
706 667
668 tstats = this_cpu_ptr(tunnel->dev->tstats);
669 tstats->rx_packets++;
670 tstats->rx_bytes += skb->len;
671
707 __skb_tunnel_rx(skb, tunnel->dev); 672 __skb_tunnel_rx(skb, tunnel->dev);
708 673
709 skb_reset_network_header(skb); 674 skb_reset_network_header(skb);
710 err = IP_ECN_decapsulate(iph, skb); 675 ipgre_ecn_decapsulate(iph, skb);
711 if (unlikely(err)) {
712 if (log_ecn_error)
713 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
714 &iph->saddr, iph->tos);
715 if (err > 1) {
716 ++tunnel->dev->stats.rx_frame_errors;
717 ++tunnel->dev->stats.rx_errors;
718 goto drop;
719 }
720 }
721 676
722 tstats = this_cpu_ptr(tunnel->dev->tstats); 677 netif_rx(skb);
723 u64_stats_update_begin(&tstats->syncp);
724 tstats->rx_packets++;
725 tstats->rx_bytes += skb->len;
726 u64_stats_update_end(&tstats->syncp);
727 678
728 gro_cells_receive(&tunnel->gro_cells, skb); 679 rcu_read_unlock();
729 return 0; 680 return 0;
730 } 681 }
731 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 682 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
732 683
733drop: 684drop:
685 rcu_read_unlock();
686drop_nolock:
734 kfree_skb(skb); 687 kfree_skb(skb);
735 return 0; 688 return 0;
736} 689}
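ipgre_rcv() above walks the optional GRE fields in a fixed order: a 4 byte base header, then 4 bytes each for checksum, key and sequence number when the matching flag bit is set (RFC 2890), which is exactly why `offset` grows in steps of 4. A small standalone sketch of that offset arithmetic; the masks are written in host order here, whereas the kernel constants are network-order __be16 values:

#include <stdint.h>
#include <stdio.h>

#define GRE_CSUM 0x8000
#define GRE_KEY  0x2000
#define GRE_SEQ  0x1000

/* Total GRE header length implied by the flag word. */
static int gre_header_len(uint16_t flags)
{
        int len = 4;                    /* flags + protocol type */
        if (flags & GRE_CSUM)
                len += 4;               /* checksum + reserved */
        if (flags & GRE_KEY)
                len += 4;
        if (flags & GRE_SEQ)
                len += 4;
        return len;
}

int main(void)
{
        printf("%d\n", gre_header_len(GRE_KEY | GRE_SEQ));   /* prints 12 */
        return 0;
}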
@@ -738,6 +691,7 @@ drop:
738static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 691static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
739{ 692{
740 struct ip_tunnel *tunnel = netdev_priv(dev); 693 struct ip_tunnel *tunnel = netdev_priv(dev);
694 struct pcpu_tstats *tstats;
741 const struct iphdr *old_iph = ip_hdr(skb); 695 const struct iphdr *old_iph = ip_hdr(skb);
742 const struct iphdr *tiph; 696 const struct iphdr *tiph;
743 struct flowi4 fl4; 697 struct flowi4 fl4;
@@ -750,21 +704,13 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
750 int gre_hlen; 704 int gre_hlen;
751 __be32 dst; 705 __be32 dst;
752 int mtu; 706 int mtu;
753 u8 ttl;
754
755 if (skb->ip_summed == CHECKSUM_PARTIAL &&
756 skb_checksum_help(skb))
757 goto tx_error;
758 707
759 if (dev->type == ARPHRD_ETHER) 708 if (dev->type == ARPHRD_ETHER)
760 IPCB(skb)->flags = 0; 709 IPCB(skb)->flags = 0;
761 710
762 if (dev->header_ops && dev->type == ARPHRD_IPGRE) { 711 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
763 gre_hlen = 0; 712 gre_hlen = 0;
764 if (skb->protocol == htons(ETH_P_IP)) 713 tiph = (const struct iphdr *)skb->data;
765 tiph = (const struct iphdr *)skb->data;
766 else
767 tiph = &tunnel->parms.iph;
768 } else { 714 } else {
769 gre_hlen = tunnel->hlen; 715 gre_hlen = tunnel->hlen;
770 tiph = &tunnel->parms.iph; 716 tiph = &tunnel->parms.iph;
@@ -780,16 +726,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
780 726
781 if (skb->protocol == htons(ETH_P_IP)) { 727 if (skb->protocol == htons(ETH_P_IP)) {
782 rt = skb_rtable(skb); 728 rt = skb_rtable(skb);
783 dst = rt_nexthop(rt, old_iph->daddr); 729 if ((dst = rt->rt_gateway) == 0)
730 goto tx_error_icmp;
784 } 731 }
785#if IS_ENABLED(CONFIG_IPV6) 732#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
786 else if (skb->protocol == htons(ETH_P_IPV6)) { 733 else if (skb->protocol == htons(ETH_P_IPV6)) {
734 struct neighbour *neigh = dst_get_neighbour(skb_dst(skb));
787 const struct in6_addr *addr6; 735 const struct in6_addr *addr6;
788 struct neighbour *neigh;
789 bool do_tx_error_icmp;
790 int addr_type; 736 int addr_type;
791 737
792 neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
793 if (neigh == NULL) 738 if (neigh == NULL)
794 goto tx_error; 739 goto tx_error;
795 740
@@ -802,21 +747,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
802 } 747 }
803 748
804 if ((addr_type & IPV6_ADDR_COMPATv4) == 0) 749 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
805 do_tx_error_icmp = true;
806 else {
807 do_tx_error_icmp = false;
808 dst = addr6->s6_addr32[3];
809 }
810 neigh_release(neigh);
811 if (do_tx_error_icmp)
812 goto tx_error_icmp; 750 goto tx_error_icmp;
751
752 dst = addr6->s6_addr32[3];
813 } 753 }
814#endif 754#endif
815 else 755 else
816 goto tx_error; 756 goto tx_error;
817 } 757 }
818 758
819 ttl = tiph->ttl;
820 tos = tiph->tos; 759 tos = tiph->tos;
821 if (tos == 1) { 760 if (tos == 1) {
822 tos = 0; 761 tos = 0;
@@ -848,7 +787,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
848 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; 787 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
849 788
850 if (skb_dst(skb)) 789 if (skb_dst(skb))
851 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); 790 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
852 791
853 if (skb->protocol == htons(ETH_P_IP)) { 792 if (skb->protocol == htons(ETH_P_IP)) {
854 df |= (old_iph->frag_off&htons(IP_DF)); 793 df |= (old_iph->frag_off&htons(IP_DF));
@@ -860,7 +799,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
860 goto tx_error; 799 goto tx_error;
861 } 800 }
862 } 801 }
863#if IS_ENABLED(CONFIG_IPV6) 802#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
864 else if (skb->protocol == htons(ETH_P_IPV6)) { 803 else if (skb->protocol == htons(ETH_P_IPV6)) {
865 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); 804 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
866 805
@@ -909,12 +848,11 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
909 dev_kfree_skb(skb); 848 dev_kfree_skb(skb);
910 skb = new_skb; 849 skb = new_skb;
911 old_iph = ip_hdr(skb); 850 old_iph = ip_hdr(skb);
912 /* Warning : tiph value might point to freed memory */
913 } 851 }
914 852
853 skb_reset_transport_header(skb);
915 skb_push(skb, gre_hlen); 854 skb_push(skb, gre_hlen);
916 skb_reset_network_header(skb); 855 skb_reset_network_header(skb);
917 skb_set_transport_header(skb, sizeof(*iph));
918 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 856 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
919 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 857 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
920 IPSKB_REROUTED); 858 IPSKB_REROUTED);
@@ -933,12 +871,11 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
933 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); 871 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
934 iph->daddr = fl4.daddr; 872 iph->daddr = fl4.daddr;
935 iph->saddr = fl4.saddr; 873 iph->saddr = fl4.saddr;
936 iph->ttl = ttl;
937 874
938 if (ttl == 0) { 875 if ((iph->ttl = tiph->ttl) == 0) {
939 if (skb->protocol == htons(ETH_P_IP)) 876 if (skb->protocol == htons(ETH_P_IP))
940 iph->ttl = old_iph->ttl; 877 iph->ttl = old_iph->ttl;
941#if IS_ENABLED(CONFIG_IPV6) 878#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
942 else if (skb->protocol == htons(ETH_P_IPV6)) 879 else if (skb->protocol == htons(ETH_P_IPV6))
943 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit; 880 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
944#endif 881#endif
@@ -951,7 +888,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
951 htons(ETH_P_TEB) : skb->protocol; 888 htons(ETH_P_TEB) : skb->protocol;
952 889
953 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { 890 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
954 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4); 891 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
955 892
956 if (tunnel->parms.o_flags&GRE_SEQ) { 893 if (tunnel->parms.o_flags&GRE_SEQ) {
957 ++tunnel->o_seqno; 894 ++tunnel->o_seqno;
@@ -964,17 +901,18 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
964 } 901 }
965 if (tunnel->parms.o_flags&GRE_CSUM) { 902 if (tunnel->parms.o_flags&GRE_CSUM) {
966 *ptr = 0; 903 *ptr = 0;
967 *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr)); 904 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
968 } 905 }
969 } 906 }
970 907
971 iptunnel_xmit(skb, dev); 908 nf_reset(skb);
909 tstats = this_cpu_ptr(dev->tstats);
910 __IPTUNNEL_XMIT(tstats, &dev->stats);
972 return NETDEV_TX_OK; 911 return NETDEV_TX_OK;
973 912
974#if IS_ENABLED(CONFIG_IPV6)
975tx_error_icmp: 913tx_error_icmp:
976 dst_link_failure(skb); 914 dst_link_failure(skb);
977#endif 915
978tx_error: 916tx_error:
979 dev->stats.tx_errors++; 917 dev->stats.tx_errors++;
980 dev_kfree_skb(skb); 918 dev_kfree_skb(skb);
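When GRE_CSUM is set, the transmit path above writes the standard Internet checksum over the GRE header and payload (ip_compute_csum() in the kernel). The arithmetic is the 16 bit one's complement sum of RFC 1071; a self-contained sketch follows. Byte-order handling differs slightly from the in-kernel helper, which sums 16-bit loads in host order, but the folding and complement are the same:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* One's-complement checksum over a buffer (RFC 1071). */
static uint16_t inet_csum(const void *buf, size_t len)
{
        const uint8_t *p = buf;
        uint32_t sum = 0;

        while (len > 1) {
                sum += (uint32_t)p[0] << 8 | p[1];
                p += 2;
                len -= 2;
        }
        if (len)                        /* odd trailing byte */
                sum += (uint32_t)p[0] << 8;
        while (sum >> 16)               /* fold carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        uint8_t data[] = { 0x45, 0x00, 0x00, 0x1c };
        printf("%#06x\n", inet_csum(data, sizeof(data)));
        return 0;
}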
@@ -1071,7 +1009,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1071 case SIOCADDTUNNEL: 1009 case SIOCADDTUNNEL:
1072 case SIOCCHGTUNNEL: 1010 case SIOCCHGTUNNEL:
1073 err = -EPERM; 1011 err = -EPERM;
1074 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 1012 if (!capable(CAP_NET_ADMIN))
1075 goto done; 1013 goto done;
1076 1014
1077 err = -EFAULT; 1015 err = -EFAULT;
@@ -1146,7 +1084,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1146 1084
1147 case SIOCDELTUNNEL: 1085 case SIOCDELTUNNEL:
1148 err = -EPERM; 1086 err = -EPERM;
1149 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 1087 if (!capable(CAP_NET_ADMIN))
1150 goto done; 1088 goto done;
1151 1089
1152 if (dev == ign->fb_tunnel_dev) { 1090 if (dev == ign->fb_tunnel_dev) {
@@ -1218,7 +1156,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1218{ 1156{
1219 struct ip_tunnel *t = netdev_priv(dev); 1157 struct ip_tunnel *t = netdev_priv(dev);
1220 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); 1158 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1221 __be16 *p = (__be16 *)(iph+1); 1159 __be16 *p = (__be16*)(iph+1);
1222 1160
1223 memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); 1161 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1224 p[0] = t->parms.o_flags; 1162 p[0] = t->parms.o_flags;
@@ -1302,23 +1240,15 @@ static const struct net_device_ops ipgre_netdev_ops = {
1302 .ndo_start_xmit = ipgre_tunnel_xmit, 1240 .ndo_start_xmit = ipgre_tunnel_xmit,
1303 .ndo_do_ioctl = ipgre_tunnel_ioctl, 1241 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1304 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1242 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1305 .ndo_get_stats64 = ipgre_get_stats64, 1243 .ndo_get_stats = ipgre_get_stats,
1306}; 1244};
1307 1245
1308static void ipgre_dev_free(struct net_device *dev) 1246static void ipgre_dev_free(struct net_device *dev)
1309{ 1247{
1310 struct ip_tunnel *tunnel = netdev_priv(dev);
1311
1312 gro_cells_destroy(&tunnel->gro_cells);
1313 free_percpu(dev->tstats); 1248 free_percpu(dev->tstats);
1314 free_netdev(dev); 1249 free_netdev(dev);
1315} 1250}
1316 1251
1317#define GRE_FEATURES (NETIF_F_SG | \
1318 NETIF_F_FRAGLIST | \
1319 NETIF_F_HIGHDMA | \
1320 NETIF_F_HW_CSUM)
1321
1322static void ipgre_tunnel_setup(struct net_device *dev) 1252static void ipgre_tunnel_setup(struct net_device *dev)
1323{ 1253{
1324 dev->netdev_ops = &ipgre_netdev_ops; 1254 dev->netdev_ops = &ipgre_netdev_ops;
@@ -1332,16 +1262,12 @@ static void ipgre_tunnel_setup(struct net_device *dev)
1332 dev->addr_len = 4; 1262 dev->addr_len = 4;
1333 dev->features |= NETIF_F_NETNS_LOCAL; 1263 dev->features |= NETIF_F_NETNS_LOCAL;
1334 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 1264 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1335
1336 dev->features |= GRE_FEATURES;
1337 dev->hw_features |= GRE_FEATURES;
1338} 1265}
1339 1266
1340static int ipgre_tunnel_init(struct net_device *dev) 1267static int ipgre_tunnel_init(struct net_device *dev)
1341{ 1268{
1342 struct ip_tunnel *tunnel; 1269 struct ip_tunnel *tunnel;
1343 struct iphdr *iph; 1270 struct iphdr *iph;
1344 int err;
1345 1271
1346 tunnel = netdev_priv(dev); 1272 tunnel = netdev_priv(dev);
1347 iph = &tunnel->parms.iph; 1273 iph = &tunnel->parms.iph;
@@ -1368,12 +1294,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
1368 if (!dev->tstats) 1294 if (!dev->tstats)
1369 return -ENOMEM; 1295 return -ENOMEM;
1370 1296
1371 err = gro_cells_init(&tunnel->gro_cells, dev);
1372 if (err) {
1373 free_percpu(dev->tstats);
1374 return err;
1375 }
1376
1377 return 0; 1297 return 0;
1378} 1298}
1379 1299
@@ -1574,7 +1494,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
1574 .ndo_set_mac_address = eth_mac_addr, 1494 .ndo_set_mac_address = eth_mac_addr,
1575 .ndo_validate_addr = eth_validate_addr, 1495 .ndo_validate_addr = eth_validate_addr,
1576 .ndo_change_mtu = ipgre_tunnel_change_mtu, 1496 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1577 .ndo_get_stats64 = ipgre_get_stats64, 1497 .ndo_get_stats = ipgre_get_stats,
1578}; 1498};
1579 1499
1580static void ipgre_tap_setup(struct net_device *dev) 1500static void ipgre_tap_setup(struct net_device *dev)
@@ -1605,7 +1525,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
1605 return -EEXIST; 1525 return -EEXIST;
1606 1526
1607 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) 1527 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1608 eth_hw_addr_random(dev); 1528 random_ether_addr(dev->dev_addr);
1609 1529
1610 mtu = ipgre_tunnel_bind_dev(dev); 1530 mtu = ipgre_tunnel_bind_dev(dev);
1611 if (!tb[IFLA_MTU]) 1531 if (!tb[IFLA_MTU])
@@ -1721,18 +1641,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1721 struct ip_tunnel *t = netdev_priv(dev); 1641 struct ip_tunnel *t = netdev_priv(dev);
1722 struct ip_tunnel_parm *p = &t->parms; 1642 struct ip_tunnel_parm *p = &t->parms;
1723 1643
1724 if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || 1644 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1725 nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || 1645 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1726 nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || 1646 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1727 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || 1647 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1728 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || 1648 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1729 nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || 1649 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1730 nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) || 1650 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1731 nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || 1651 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1732 nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || 1652 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1733 nla_put_u8(skb, IFLA_GRE_PMTUDISC, 1653 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1734 !!(p->iph.frag_off & htons(IP_DF)))) 1654
1735 goto nla_put_failure;
1736 return 0; 1655 return 0;
1737 1656
1738nla_put_failure: 1657nla_put_failure:
@@ -1786,7 +1705,7 @@ static int __init ipgre_init(void)
1786{ 1705{
1787 int err; 1706 int err;
1788 1707
1789 pr_info("GRE over IPv4 tunneling driver\n"); 1708 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1790 1709
1791 err = register_pernet_device(&ipgre_net_ops); 1710 err = register_pernet_device(&ipgre_net_ops);
1792 if (err < 0) 1711 if (err < 0)
@@ -1794,7 +1713,7 @@ static int __init ipgre_init(void)
1794 1713
1795 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); 1714 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1796 if (err < 0) { 1715 if (err < 0) {
1797 pr_info("%s: can't add protocol\n", __func__); 1716 printk(KERN_INFO "ipgre init: can't add protocol\n");
1798 goto add_proto_failed; 1717 goto add_proto_failed;
1799 } 1718 }
1800 1719
@@ -1823,7 +1742,7 @@ static void __exit ipgre_fini(void)
1823 rtnl_link_unregister(&ipgre_tap_ops); 1742 rtnl_link_unregister(&ipgre_tap_ops);
1824 rtnl_link_unregister(&ipgre_link_ops); 1743 rtnl_link_unregister(&ipgre_link_ops);
1825 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) 1744 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1826 pr_info("%s: can't remove protocol\n", __func__); 1745 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1827 unregister_pernet_device(&ipgre_net_ops); 1746 unregister_pernet_device(&ipgre_net_ops);
1828} 1747}
1829 1748
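The ipgre_init() hunks above keep the usual kernel module init pattern: register resources in order and unwind in reverse through goto labels when a later step fails (register_pernet_device, then gre_add_protocol, then the rtnl_link registrations). A standalone sketch of that control flow with made-up resource names, for illustration only:

#include <stdio.h>

static int setup_a(void) { return 0; }    /* stands in for the first registration */
static int setup_b(void) { return -1; }   /* stands in for a later step that fails */
static void teardown_a(void) { puts("teardown a"); }

static int demo_init(void)
{
        int err;

        err = setup_a();
        if (err)
                goto out;
        err = setup_b();
        if (err)
                goto undo_a;              /* unwind only what already succeeded */
        return 0;

undo_a:
        teardown_a();
out:
        return err;
}

int main(void)
{
        printf("init: %d\n", demo_init());
        return 0;
}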
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f1395a6fb35..073a9b01c40 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -113,8 +113,7 @@
113 * 2 of the License, or (at your option) any later version. 113 * 2 of the License, or (at your option) any later version.
114 */ 114 */
115 115
116#define pr_fmt(fmt) "IPv4: " fmt 116#include <asm/system.h>
117
118#include <linux/module.h> 117#include <linux/module.h>
119#include <linux/types.h> 118#include <linux/types.h>
120#include <linux/kernel.h> 119#include <linux/kernel.h>
@@ -149,7 +148,7 @@
149/* 148/*
150 * Process Router Attention IP option (RFC 2113) 149 * Process Router Attention IP option (RFC 2113)
151 */ 150 */
152bool ip_call_ra_chain(struct sk_buff *skb) 151int ip_call_ra_chain(struct sk_buff *skb)
153{ 152{
154 struct ip_ra_chain *ra; 153 struct ip_ra_chain *ra;
155 u8 protocol = ip_hdr(skb)->protocol; 154 u8 protocol = ip_hdr(skb)->protocol;
@@ -168,7 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
168 net_eq(sock_net(sk), dev_net(dev))) { 167 net_eq(sock_net(sk), dev_net(dev))) {
169 if (ip_is_fragment(ip_hdr(skb))) { 168 if (ip_is_fragment(ip_hdr(skb))) {
170 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) 169 if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
171 return true; 170 return 1;
172 } 171 }
173 if (last) { 172 if (last) {
174 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 173 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -181,9 +180,9 @@ bool ip_call_ra_chain(struct sk_buff *skb)
181 180
182 if (last) { 181 if (last) {
183 raw_rcv(last, skb); 182 raw_rcv(last, skb);
184 return true; 183 return 1;
185 } 184 }
186 return false; 185 return 0;
187} 186}
188 187
189static int ip_local_deliver_finish(struct sk_buff *skb) 188static int ip_local_deliver_finish(struct sk_buff *skb)
@@ -198,19 +197,21 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
198 rcu_read_lock(); 197 rcu_read_lock();
199 { 198 {
200 int protocol = ip_hdr(skb)->protocol; 199 int protocol = ip_hdr(skb)->protocol;
200 int hash, raw;
201 const struct net_protocol *ipprot; 201 const struct net_protocol *ipprot;
202 int raw;
203 202
204 resubmit: 203 resubmit:
205 raw = raw_local_deliver(skb, protocol); 204 raw = raw_local_deliver(skb, protocol);
206 205
207 ipprot = rcu_dereference(inet_protos[protocol]); 206 hash = protocol & (MAX_INET_PROTOS - 1);
207 ipprot = rcu_dereference(inet_protos[hash]);
208 if (ipprot != NULL) { 208 if (ipprot != NULL) {
209 int ret; 209 int ret;
210 210
211 if (!net_eq(net, &init_net) && !ipprot->netns_ok) { 211 if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
212 net_info_ratelimited("%s: proto %d isn't netns-ready\n", 212 if (net_ratelimit())
213 __func__, protocol); 213 printk("%s: proto %d isn't netns-ready\n",
214 __func__, protocol);
214 kfree_skb(skb); 215 kfree_skb(skb);
215 goto out; 216 goto out;
216 } 217 }
@@ -264,7 +265,7 @@ int ip_local_deliver(struct sk_buff *skb)
264 ip_local_deliver_finish); 265 ip_local_deliver_finish);
265} 266}
266 267
267static inline bool ip_rcv_options(struct sk_buff *skb) 268static inline int ip_rcv_options(struct sk_buff *skb)
268{ 269{
269 struct ip_options *opt; 270 struct ip_options *opt;
270 const struct iphdr *iph; 271 const struct iphdr *iph;
@@ -296,10 +297,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
296 297
297 if (in_dev) { 298 if (in_dev) {
298 if (!IN_DEV_SOURCE_ROUTE(in_dev)) { 299 if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
299 if (IN_DEV_LOG_MARTIANS(in_dev)) 300 if (IN_DEV_LOG_MARTIANS(in_dev) &&
300 net_info_ratelimited("source route option %pI4 -> %pI4\n", 301 net_ratelimit())
301 &iph->saddr, 302 printk(KERN_INFO "source route option %pI4 -> %pI4\n",
302 &iph->daddr); 303 &iph->saddr, &iph->daddr);
303 goto drop; 304 goto drop;
304 } 305 }
305 } 306 }
@@ -308,40 +309,31 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
308 goto drop; 309 goto drop;
309 } 310 }
310 311
311 return false; 312 return 0;
312drop: 313drop:
313 return true; 314 return -1;
314} 315}
315 316
316int sysctl_ip_early_demux __read_mostly = 1;
317EXPORT_SYMBOL(sysctl_ip_early_demux);
318
319static int ip_rcv_finish(struct sk_buff *skb) 317static int ip_rcv_finish(struct sk_buff *skb)
320{ 318{
321 const struct iphdr *iph = ip_hdr(skb); 319 const struct iphdr *iph = ip_hdr(skb);
322 struct rtable *rt; 320 struct rtable *rt;
323 321
324 if (sysctl_ip_early_demux && !skb_dst(skb)) {
325 const struct net_protocol *ipprot;
326 int protocol = iph->protocol;
327
328 ipprot = rcu_dereference(inet_protos[protocol]);
329 if (ipprot && ipprot->early_demux) {
330 ipprot->early_demux(skb);
331 /* must reload iph, skb->head might have changed */
332 iph = ip_hdr(skb);
333 }
334 }
335
336 /* 322 /*
337 * Initialise the virtual path cache for the packet. It describes 323 * Initialise the virtual path cache for the packet. It describes
338 * how the packet travels inside Linux networking. 324 * how the packet travels inside Linux networking.
339 */ 325 */
340 if (!skb_dst(skb)) { 326 if (skb_dst(skb) == NULL) {
341 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 327 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
342 iph->tos, skb->dev); 328 iph->tos, skb->dev);
343 if (unlikely(err)) { 329 if (unlikely(err)) {
344 if (err == -EXDEV) 330 if (err == -EHOSTUNREACH)
331 IP_INC_STATS_BH(dev_net(skb->dev),
332 IPSTATS_MIB_INADDRERRORS);
333 else if (err == -ENETUNREACH)
334 IP_INC_STATS_BH(dev_net(skb->dev),
335 IPSTATS_MIB_INNOROUTES);
336 else if (err == -EXDEV)
345 NET_INC_STATS_BH(dev_net(skb->dev), 337 NET_INC_STATS_BH(dev_net(skb->dev),
346 LINUX_MIB_IPRPFILTER); 338 LINUX_MIB_IPRPFILTER);
347 goto drop; 339 goto drop;
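In ip_local_deliver_finish() above, the right-hand column masks the protocol number with (MAX_INET_PROTOS - 1) before indexing inet_protos[], while the left-hand code indexes the table with the protocol byte directly; for a 256-entry table and an 8-bit protocol the mask is a no-op, which is why the newer code drops it. Either way the idea is a flat handler table keyed by the IP protocol number. A sketch with hypothetical handlers:

#include <stdio.h>

#define MAX_PROTOS 256                    /* the protocol field is one byte */

typedef int (*proto_handler)(const void *pkt, int len);

static int tcp_demo(const void *pkt, int len) { (void)pkt; return len; }
static int udp_demo(const void *pkt, int len) { (void)pkt; return len; }

static proto_handler handlers[MAX_PROTOS];

static int deliver(unsigned char protocol, const void *pkt, int len)
{
        proto_handler h = handlers[protocol];
        return h ? h(pkt, len) : -1;      /* no handler: caller may raise ICMP */
}

int main(void)
{
        handlers[6]  = tcp_demo;          /* IPPROTO_TCP */
        handlers[17] = udp_demo;          /* IPPROTO_UDP */
        printf("%d\n", deliver(6, "x", 1));
        return 0;
}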
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index f6289bf6f33..05d20cca9d6 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -9,8 +9,6 @@
9 * 9 *
10 */ 10 */
11 11
12#define pr_fmt(fmt) "IPv4: " fmt
13
14#include <linux/capability.h> 12#include <linux/capability.h>
15#include <linux/module.h> 13#include <linux/module.h>
16#include <linux/slab.h> 14#include <linux/slab.h>
@@ -27,7 +25,6 @@
27#include <net/icmp.h> 25#include <net/icmp.h>
28#include <net/route.h> 26#include <net/route.h>
29#include <net/cipso_ipv4.h> 27#include <net/cipso_ipv4.h>
30#include <net/ip_fib.h>
31 28
32/* 29/*
33 * Write options to IP header, record destination address to 30 * Write options to IP header, record destination address to
@@ -93,6 +90,7 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
93 unsigned char *sptr, *dptr; 90 unsigned char *sptr, *dptr;
94 int soffset, doffset; 91 int soffset, doffset;
95 int optlen; 92 int optlen;
93 __be32 daddr;
96 94
97 memset(dopt, 0, sizeof(struct ip_options)); 95 memset(dopt, 0, sizeof(struct ip_options));
98 96
@@ -104,6 +102,8 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
104 sptr = skb_network_header(skb); 102 sptr = skb_network_header(skb);
105 dptr = dopt->__data; 103 dptr = dopt->__data;
106 104
105 daddr = skb_rtable(skb)->rt_spec_dst;
106
107 if (sopt->rr) { 107 if (sopt->rr) {
108 optlen = sptr[sopt->rr+1]; 108 optlen = sptr[sopt->rr+1];
109 soffset = sptr[sopt->rr+2]; 109 soffset = sptr[sopt->rr+2];
@@ -177,8 +177,6 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
177 doffset -= 4; 177 doffset -= 4;
178 } 178 }
179 if (doffset > 3) { 179 if (doffset > 3) {
180 __be32 daddr = fib_compute_spec_dst(skb);
181
182 memcpy(&start[doffset-1], &daddr, 4); 180 memcpy(&start[doffset-1], &daddr, 4);
183 dopt->faddr = faddr; 181 dopt->faddr = faddr;
184 dptr[0] = start[0]; 182 dptr[0] = start[0];
@@ -210,10 +208,10 @@ int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb)
210 * Simple and stupid 8), but the most efficient way. 208 * Simple and stupid 8), but the most efficient way.
211 */ 209 */
212 210
213void ip_options_fragment(struct sk_buff *skb) 211void ip_options_fragment(struct sk_buff * skb)
214{ 212{
215 unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); 213 unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
216 struct ip_options *opt = &(IPCB(skb)->opt); 214 struct ip_options * opt = &(IPCB(skb)->opt);
217 int l = opt->optlen; 215 int l = opt->optlen;
218 int optlen; 216 int optlen;
219 217
@@ -241,15 +239,6 @@ void ip_options_fragment(struct sk_buff *skb)
241 opt->ts_needtime = 0; 239 opt->ts_needtime = 0;
242} 240}
243 241
244/* helper used by ip_options_compile() to call fib_compute_spec_dst()
245 * at most one time.
246 */
247static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
248{
249 if (*spec_dst == htonl(INADDR_ANY))
250 *spec_dst = fib_compute_spec_dst(skb);
251}
252
253/* 242/*
254 * Verify options and fill pointers in struct options. 243 * Verify options and fill pointers in struct options.
255 * Caller should clear *opt, and set opt->data. 244 * Caller should clear *opt, and set opt->data.
@@ -257,14 +246,14 @@ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
257 */ 246 */
258 247
259int ip_options_compile(struct net *net, 248int ip_options_compile(struct net *net,
260 struct ip_options *opt, struct sk_buff *skb) 249 struct ip_options * opt, struct sk_buff * skb)
261{ 250{
262 __be32 spec_dst = htonl(INADDR_ANY); 251 int l;
263 unsigned char *pp_ptr = NULL; 252 unsigned char * iph;
253 unsigned char * optptr;
254 int optlen;
255 unsigned char * pp_ptr = NULL;
264 struct rtable *rt = NULL; 256 struct rtable *rt = NULL;
265 unsigned char *optptr;
266 unsigned char *iph;
267 int optlen, l;
268 257
269 if (skb != NULL) { 258 if (skb != NULL) {
270 rt = skb_rtable(skb); 259 rt = skb_rtable(skb);
@@ -340,8 +329,7 @@ int ip_options_compile(struct net *net,
340 goto error; 329 goto error;
341 } 330 }
342 if (rt) { 331 if (rt) {
343 spec_dst_fill(&spec_dst, skb); 332 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
344 memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
345 opt->is_changed = 1; 333 opt->is_changed = 1;
346 } 334 }
347 optptr[2] += 4; 335 optptr[2] += 4;
@@ -383,8 +371,7 @@ int ip_options_compile(struct net *net,
383 } 371 }
384 opt->ts = optptr - iph; 372 opt->ts = optptr - iph;
385 if (rt) { 373 if (rt) {
386 spec_dst_fill(&spec_dst, skb); 374 memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
387 memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
388 timeptr = &optptr[optptr[2]+3]; 375 timeptr = &optptr[optptr[2]+3];
389 } 376 }
390 opt->ts_needaddr = 1; 377 opt->ts_needaddr = 1;
@@ -409,7 +396,7 @@ int ip_options_compile(struct net *net,
409 optptr[2] += 8; 396 optptr[2] += 8;
410 break; 397 break;
411 default: 398 default:
412 if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { 399 if (!skb && !capable(CAP_NET_RAW)) {
413 pp_ptr = optptr + 3; 400 pp_ptr = optptr + 3;
414 goto error; 401 goto error;
415 } 402 }
@@ -424,7 +411,7 @@ int ip_options_compile(struct net *net,
424 opt->is_changed = 1; 411 opt->is_changed = 1;
425 } 412 }
426 } else { 413 } else {
427 unsigned int overflow = optptr[3]>>4; 414 unsigned overflow = optptr[3]>>4;
428 if (overflow == 15) { 415 if (overflow == 15) {
429 pp_ptr = optptr + 3; 416 pp_ptr = optptr + 3;
430 goto error; 417 goto error;
@@ -445,7 +432,7 @@ int ip_options_compile(struct net *net,
445 opt->router_alert = optptr - iph; 432 opt->router_alert = optptr - iph;
446 break; 433 break;
447 case IPOPT_CIPSO: 434 case IPOPT_CIPSO:
448 if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { 435 if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) {
449 pp_ptr = optptr; 436 pp_ptr = optptr;
450 goto error; 437 goto error;
451 } 438 }
@@ -458,7 +445,7 @@ int ip_options_compile(struct net *net,
458 case IPOPT_SEC: 445 case IPOPT_SEC:
459 case IPOPT_SID: 446 case IPOPT_SID:
460 default: 447 default:
461 if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { 448 if (!skb && !capable(CAP_NET_RAW)) {
462 pp_ptr = optptr; 449 pp_ptr = optptr;
463 goto error; 450 goto error;
464 } 451 }
@@ -484,20 +471,20 @@ EXPORT_SYMBOL(ip_options_compile);
484 * Undo all the changes done by ip_options_compile(). 471 * Undo all the changes done by ip_options_compile().
485 */ 472 */
486 473
487void ip_options_undo(struct ip_options *opt) 474void ip_options_undo(struct ip_options * opt)
488{ 475{
489 if (opt->srr) { 476 if (opt->srr) {
490 unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr); 477 unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr);
491 memmove(optptr+7, optptr+3, optptr[1]-7); 478 memmove(optptr+7, optptr+3, optptr[1]-7);
492 memcpy(optptr+3, &opt->faddr, 4); 479 memcpy(optptr+3, &opt->faddr, 4);
493 } 480 }
494 if (opt->rr_needaddr) { 481 if (opt->rr_needaddr) {
495 unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr); 482 unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr);
496 optptr[2] -= 4; 483 optptr[2] -= 4;
497 memset(&optptr[optptr[2]-1], 0, 4); 484 memset(&optptr[optptr[2]-1], 0, 4);
498 } 485 }
499 if (opt->ts) { 486 if (opt->ts) {
500 unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr); 487 unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr);
501 if (opt->ts_needtime) { 488 if (opt->ts_needtime) {
502 optptr[2] -= 4; 489 optptr[2] -= 4;
503 memset(&optptr[optptr[2]-1], 0, 4); 490 memset(&optptr[optptr[2]-1], 0, 4);
@@ -560,8 +547,8 @@ int ip_options_get(struct net *net, struct ip_options_rcu **optp,
560 547
561void ip_forward_options(struct sk_buff *skb) 548void ip_forward_options(struct sk_buff *skb)
562{ 549{
563 struct ip_options *opt = &(IPCB(skb)->opt); 550 struct ip_options * opt = &(IPCB(skb)->opt);
564 unsigned char *optptr; 551 unsigned char * optptr;
565 struct rtable *rt = skb_rtable(skb); 552 struct rtable *rt = skb_rtable(skb);
566 unsigned char *raw = skb_network_header(skb); 553 unsigned char *raw = skb_network_header(skb);
567 554
@@ -581,18 +568,15 @@ void ip_forward_options(struct sk_buff *skb)
581 ) { 568 ) {
582 if (srrptr + 3 > srrspace) 569 if (srrptr + 3 > srrspace)
583 break; 570 break;
584 if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) 571 if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0)
585 break; 572 break;
586 } 573 }
587 if (srrptr + 3 <= srrspace) { 574 if (srrptr + 3 <= srrspace) {
588 opt->is_changed = 1; 575 opt->is_changed = 1;
589 ip_hdr(skb)->daddr = opt->nexthop;
590 ip_rt_get_source(&optptr[srrptr-1], skb, rt); 576 ip_rt_get_source(&optptr[srrptr-1], skb, rt);
591 optptr[2] = srrptr+4; 577 optptr[2] = srrptr+4;
592 } else { 578 } else if (net_ratelimit())
593 net_crit_ratelimited("%s(): Argh! Destination lost!\n", 579 printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
594 __func__);
595 }
596 if (opt->ts_needaddr) { 580 if (opt->ts_needaddr) {
597 optptr = raw + opt->ts; 581 optptr = raw + opt->ts;
598 ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); 582 ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
@@ -656,7 +640,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
656 } 640 }
657 if (srrptr <= srrspace) { 641 if (srrptr <= srrspace) {
658 opt->srr_is_hit = 1; 642 opt->srr_is_hit = 1;
659 opt->nexthop = nexthop; 643 iph->daddr = nexthop;
660 opt->is_changed = 1; 644 opt->is_changed = 1;
661 } 645 }
662 return 0; 646 return 0;
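Both the record-route and timestamp handling in ip_options_compile() above rely on the RFC 791 option layout: byte 0 is the type, byte 1 the length, byte 2 a 1-based pointer to the next free slot, so an address is written at optptr[pointer - 1] and the pointer then advances by 4. A simplified standalone sketch of recording one address; the kernel additionally validates the length byte and handles the timestamp sub-formats:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Record an IPv4 address into a record-route option buffer.
 * opt[0]=type, opt[1]=length, opt[2]=pointer (1-based). Returns 0 on
 * success, -1 if the option area is already full. */
static int rr_record(uint8_t *opt, uint32_t addr_be)
{
        uint8_t len = opt[1], ptr = opt[2];

        if (ptr + 3 > len)                /* no room for another address */
                return -1;
        memcpy(&opt[ptr - 1], &addr_be, 4);
        opt[2] = ptr + 4;
        return 0;
}

int main(void)
{
        uint8_t opt[11] = { 7, 11, 4 };   /* IPOPT_RR with room for two hops */
        uint32_t a = 0x0a000001;          /* 10.0.0.1, host order for the demo */
        printf("%d %d\n", rr_record(opt, a), opt[2]);
        return 0;
}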
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 3e98ed2bff5..8c6563361ab 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -43,6 +43,7 @@
43 */ 43 */
44 44
45#include <asm/uaccess.h> 45#include <asm/uaccess.h>
46#include <asm/system.h>
46#include <linux/module.h> 47#include <linux/module.h>
47#include <linux/types.h> 48#include <linux/types.h>
48#include <linux/kernel.h> 49#include <linux/kernel.h>
@@ -113,6 +114,19 @@ int ip_local_out(struct sk_buff *skb)
113} 114}
114EXPORT_SYMBOL_GPL(ip_local_out); 115EXPORT_SYMBOL_GPL(ip_local_out);
115 116
117/* dev_loopback_xmit for use with netfilter. */
118static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119{
120 skb_reset_mac_header(newskb);
121 __skb_pull(newskb, skb_network_offset(newskb));
122 newskb->pkt_type = PACKET_LOOPBACK;
123 newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 WARN_ON(!skb_dst(newskb));
125 skb_dst_force(newskb);
126 netif_rx_ni(newskb);
127 return 0;
128}
129
116static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) 130static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
117{ 131{
118 int ttl = inet->uc_ttl; 132 int ttl = inet->uc_ttl;
@@ -170,7 +184,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
170 struct net_device *dev = dst->dev; 184 struct net_device *dev = dst->dev;
171 unsigned int hh_len = LL_RESERVED_SPACE(dev); 185 unsigned int hh_len = LL_RESERVED_SPACE(dev);
172 struct neighbour *neigh; 186 struct neighbour *neigh;
173 u32 nexthop;
174 187
175 if (rt->rt_type == RTN_MULTICAST) { 188 if (rt->rt_type == RTN_MULTICAST) {
176 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len); 189 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
@@ -188,25 +201,22 @@ static inline int ip_finish_output2(struct sk_buff *skb)
188 } 201 }
189 if (skb->sk) 202 if (skb->sk)
190 skb_set_owner_w(skb2, skb->sk); 203 skb_set_owner_w(skb2, skb->sk);
191 consume_skb(skb); 204 kfree_skb(skb);
192 skb = skb2; 205 skb = skb2;
193 } 206 }
194 207
195 rcu_read_lock_bh(); 208 rcu_read_lock();
196 nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); 209 neigh = dst_get_neighbour(dst);
197 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); 210 if (neigh) {
198 if (unlikely(!neigh)) 211 int res = neigh_output(neigh, skb);
199 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
200 if (!IS_ERR(neigh)) {
201 int res = dst_neigh_output(dst, neigh, skb);
202 212
203 rcu_read_unlock_bh(); 213 rcu_read_unlock();
204 return res; 214 return res;
205 } 215 }
206 rcu_read_unlock_bh(); 216 rcu_read_unlock();
207 217
208 net_dbg_ratelimited("%s: No header cache and no neighbour!\n", 218 if (net_ratelimit())
209 __func__); 219 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
210 kfree_skb(skb); 220 kfree_skb(skb);
211 return -EINVAL; 221 return -EINVAL;
212} 222}
@@ -272,7 +282,7 @@ int ip_mc_output(struct sk_buff *skb)
272 if (newskb) 282 if (newskb)
273 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, 283 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
274 newskb, NULL, newskb->dev, 284 newskb, NULL, newskb->dev,
275 dev_loopback_xmit); 285 ip_dev_loopback_xmit);
276 } 286 }
277 287
278 /* Multicasts with ttl 0 must not go beyond the host */ 288 /* Multicasts with ttl 0 must not go beyond the host */
@@ -287,7 +297,7 @@ int ip_mc_output(struct sk_buff *skb)
287 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 297 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
288 if (newskb) 298 if (newskb)
289 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb, 299 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
290 NULL, newskb->dev, dev_loopback_xmit); 300 NULL, newskb->dev, ip_dev_loopback_xmit);
291 } 301 }
292 302
293 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, 303 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
@@ -309,20 +319,6 @@ int ip_output(struct sk_buff *skb)
309 !(IPCB(skb)->flags & IPSKB_REROUTED)); 319 !(IPCB(skb)->flags & IPSKB_REROUTED));
310} 320}
311 321
312/*
313 * copy saddr and daddr, possibly using 64bit load/stores
314 * Equivalent to :
315 * iph->saddr = fl4->saddr;
316 * iph->daddr = fl4->daddr;
317 */
318static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
319{
320 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
321 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
322 memcpy(&iph->saddr, &fl4->saddr,
323 sizeof(fl4->saddr) + sizeof(fl4->daddr));
324}
325
326int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) 322int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
327{ 323{
328 struct sock *sk = skb->sk; 324 struct sock *sk = skb->sk;
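The removed ip_copy_addrs() helper above copies saddr and daddr with a single memcpy, relying on the two fields being adjacent in struct flowi4 and guarding that layout assumption with BUILD_BUG_ON. The same idea in a standalone sketch, using C11 _Static_assert in place of the kernel macro and stand-in structures rather than the real ones:

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

struct demo_flow { uint32_t saddr, daddr; };   /* stand-in for flowi4 */
struct demo_hdr  { uint32_t saddr, daddr; };   /* stand-in for the iphdr fields */

static void copy_addrs(struct demo_hdr *h, const struct demo_flow *fl)
{
        /* Both 32-bit addresses are copied in one go; the assert keeps the
         * shortcut honest if the structure layout ever changes. */
        _Static_assert(offsetof(struct demo_flow, daddr) ==
                       offsetof(struct demo_flow, saddr) + sizeof(uint32_t),
                       "saddr/daddr must be adjacent");
        memcpy(&h->saddr, &fl->saddr, sizeof(fl->saddr) + sizeof(fl->daddr));
}

int main(void)
{
        struct demo_flow fl = { 0x01020304, 0x05060708 };
        struct demo_hdr h;
        copy_addrs(&h, &fl);
        printf("%x %x\n", h.saddr, h.daddr);
        return 0;
}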
@@ -371,7 +367,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
371 skb_dst_set_noref(skb, &rt->dst); 367 skb_dst_set_noref(skb, &rt->dst);
372 368
373packet_routed: 369packet_routed:
374 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) 370 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
375 goto no_route; 371 goto no_route;
376 372
377 /* OK, we know where to send it, allocate and build IP header. */ 373 /* OK, we know where to send it, allocate and build IP header. */
@@ -385,8 +381,8 @@ packet_routed:
385 iph->frag_off = 0; 381 iph->frag_off = 0;
386 iph->ttl = ip_select_ttl(inet, &rt->dst); 382 iph->ttl = ip_select_ttl(inet, &rt->dst);
387 iph->protocol = sk->sk_protocol; 383 iph->protocol = sk->sk_protocol;
388 ip_copy_addrs(iph, fl4); 384 iph->saddr = fl4->saddr;
389 385 iph->daddr = fl4->daddr;
390 /* Transport layer set skb->h.foo itself. */ 386 /* Transport layer set skb->h.foo itself. */
391 387
392 if (inet_opt && inet_opt->opt.optlen) { 388 if (inet_opt && inet_opt->opt.optlen) {
@@ -467,9 +463,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
467 463
468 iph = ip_hdr(skb); 464 iph = ip_hdr(skb);
469 465
470 if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) || 466 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
471 (IPCB(skb)->frag_max_size &&
472 IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
473 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); 467 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
474 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, 468 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
475 htonl(ip_skb_dst_mtu(skb))); 469 htonl(ip_skb_dst_mtu(skb)));
@@ -595,10 +589,6 @@ slow_path_clean:
595 } 589 }
596 590
597slow_path: 591slow_path:
598 /* for offloaded checksums cleanup checksum before fragmentation */
599 if ((skb->ip_summed == CHECKSUM_PARTIAL) && skb_checksum_help(skb))
600 goto fail;
601
602 left = skb->len - hlen; /* Space per frame */ 592 left = skb->len - hlen; /* Space per frame */
603 ptr = hlen; /* Where to start from */ 593 ptr = hlen; /* Where to start from */
604 594
@@ -706,7 +696,7 @@ slow_path:
706 696
707 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); 697 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
708 } 698 }
709 consume_skb(skb); 699 kfree_skb(skb);
710 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); 700 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
711 return err; 701 return err;
712 702
@@ -797,7 +787,6 @@ static int __ip_append_data(struct sock *sk,
797 struct flowi4 *fl4, 787 struct flowi4 *fl4,
798 struct sk_buff_head *queue, 788 struct sk_buff_head *queue,
799 struct inet_cork *cork, 789 struct inet_cork *cork,
800 struct page_frag *pfrag,
801 int getfrag(void *from, char *to, int offset, 790 int getfrag(void *from, char *to, int offset,
802 int len, int odd, struct sk_buff *skb), 791 int len, int odd, struct sk_buff *skb),
803 void *from, int length, int transhdrlen, 792 void *from, int length, int transhdrlen,
@@ -992,30 +981,46 @@ alloc_new_skb:
992 } 981 }
993 } else { 982 } else {
994 int i = skb_shinfo(skb)->nr_frags; 983 int i = skb_shinfo(skb)->nr_frags;
995 984 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
996 err = -ENOMEM; 985 struct page *page = cork->page;
997 if (!sk_page_frag_refill(sk, pfrag)) 986 int off = cork->off;
998 goto error; 987 unsigned int left;
999 988
1000 if (!skb_can_coalesce(skb, i, pfrag->page, 989 if (page && (left = PAGE_SIZE - off) > 0) {
1001 pfrag->offset)) { 990 if (copy >= left)
1002 err = -EMSGSIZE; 991 copy = left;
1003 if (i == MAX_SKB_FRAGS) 992 if (page != frag->page) {
993 if (i == MAX_SKB_FRAGS) {
994 err = -EMSGSIZE;
995 goto error;
996 }
997 get_page(page);
998 skb_fill_page_desc(skb, i, page, off, 0);
999 frag = &skb_shinfo(skb)->frags[i];
1000 }
1001 } else if (i < MAX_SKB_FRAGS) {
1002 if (copy > PAGE_SIZE)
1003 copy = PAGE_SIZE;
1004 page = alloc_pages(sk->sk_allocation, 0);
1005 if (page == NULL) {
1006 err = -ENOMEM;
1004 goto error; 1007 goto error;
1008 }
1009 cork->page = page;
1010 cork->off = 0;
1005 1011
1006 __skb_fill_page_desc(skb, i, pfrag->page, 1012 skb_fill_page_desc(skb, i, page, 0, 0);
1007 pfrag->offset, 0); 1013 frag = &skb_shinfo(skb)->frags[i];
1008 skb_shinfo(skb)->nr_frags = ++i; 1014 } else {
1009 get_page(pfrag->page); 1015 err = -EMSGSIZE;
1016 goto error;
1010 } 1017 }
1011 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1018 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1012 if (getfrag(from, 1019 err = -EFAULT;
1013 page_address(pfrag->page) + pfrag->offset, 1020 goto error;
1014 offset, copy, skb->len, skb) < 0) 1021 }
1015 goto error_efault; 1022 cork->off += copy;
1016 1023 frag->size += copy;
1017 pfrag->offset += copy;
1018 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1019 skb->len += copy; 1024 skb->len += copy;
1020 skb->data_len += copy; 1025 skb->data_len += copy;
1021 skb->truesize += copy; 1026 skb->truesize += copy;
@@ -1027,8 +1032,6 @@ alloc_new_skb:
1027 1032
1028 return 0; 1033 return 0;
1029 1034
1030error_efault:
1031 err = -EFAULT;
1032error: 1035error:
1033 cork->length -= length; 1036 cork->length -= length;
1034 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1037 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
@@ -1069,6 +1072,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1069 cork->dst = &rt->dst; 1072 cork->dst = &rt->dst;
1070 cork->length = 0; 1073 cork->length = 0;
1071 cork->tx_flags = ipc->tx_flags; 1074 cork->tx_flags = ipc->tx_flags;
1075 cork->page = NULL;
1076 cork->off = 0;
1072 1077
1073 return 0; 1078 return 0;
1074} 1079}
@@ -1105,8 +1110,7 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1105 transhdrlen = 0; 1110 transhdrlen = 0;
1106 } 1111 }
1107 1112
1108 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, 1113 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1109 sk_page_frag(sk), getfrag,
1110 from, length, transhdrlen, flags); 1114 from, length, transhdrlen, flags);
1111} 1115}
1112 1116
@@ -1225,7 +1229,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1225 if (len > size) 1229 if (len > size)
1226 len = size; 1230 len = size;
1227 if (skb_can_coalesce(skb, i, page, offset)) { 1231 if (skb_can_coalesce(skb, i, page, offset)) {
1228 skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); 1232 skb_shinfo(skb)->frags[i-1].size += len;
1229 } else if (i < MAX_SKB_FRAGS) { 1233 } else if (i < MAX_SKB_FRAGS) {
1230 get_page(page); 1234 get_page(page);
1231 skb_fill_page_desc(skb, i, page, offset, len); 1235 skb_fill_page_desc(skb, i, page, offset, len);
@@ -1329,10 +1333,11 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1329 iph->ihl = 5; 1333 iph->ihl = 5;
1330 iph->tos = inet->tos; 1334 iph->tos = inet->tos;
1331 iph->frag_off = df; 1335 iph->frag_off = df;
1336 ip_select_ident(iph, &rt->dst, sk);
1332 iph->ttl = ttl; 1337 iph->ttl = ttl;
1333 iph->protocol = sk->sk_protocol; 1338 iph->protocol = sk->sk_protocol;
1334 ip_copy_addrs(iph, fl4); 1339 iph->saddr = fl4->saddr;
1335 ip_select_ident(iph, &rt->dst, sk); 1340 iph->daddr = fl4->daddr;
1336 1341
1337 if (opt) { 1342 if (opt) {
1338 iph->ihl += opt->optlen>>2; 1343 iph->ihl += opt->optlen>>2;
@@ -1357,8 +1362,9 @@ out:
1357 return skb; 1362 return skb;
1358} 1363}
1359 1364
1360int ip_send_skb(struct net *net, struct sk_buff *skb) 1365int ip_send_skb(struct sk_buff *skb)
1361{ 1366{
1367 struct net *net = sock_net(skb->sk);
1362 int err; 1368 int err;
1363 1369
1364 err = ip_local_out(skb); 1370 err = ip_local_out(skb);
@@ -1381,7 +1387,7 @@ int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1381 return 0; 1387 return 0;
1382 1388
1383 /* Netfilter gets whole the not fragmented skb. */ 1389 /* Netfilter gets whole the not fragmented skb. */
1384 return ip_send_skb(sock_net(sk), skb); 1390 return ip_send_skb(skb);
1385} 1391}
1386 1392
1387/* 1393/*
@@ -1428,8 +1434,7 @@ struct sk_buff *ip_make_skb(struct sock *sk,
1428 if (err) 1434 if (err)
1429 return ERR_PTR(err); 1435 return ERR_PTR(err);
1430 1436
1431 err = __ip_append_data(sk, fl4, &queue, &cork, 1437 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1432 &current->task_frag, getfrag,
1433 from, length, transhdrlen, flags); 1438 from, length, transhdrlen, flags);
1434 if (err) { 1439 if (err) {
1435 __ip_flush_pending_frames(sk, &queue, &cork); 1440 __ip_flush_pending_frames(sk, &queue, &cork);
@@ -1454,34 +1459,19 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1454 1459
1455/* 1460/*
1456 * Generic function to send a packet as reply to another packet. 1461 * Generic function to send a packet as reply to another packet.
1457 * Used to send some TCP resets/acks so far. 1462 * Used to send TCP resets so far. ICMP should use this function too.
1458 * 1463 *
1459 * Use a fake percpu inet socket to avoid false sharing and contention. 1464 * Should run single threaded per socket because it uses the sock
1465 * structure to pass arguments.
1460 */ 1466 */
1461static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { 1467void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1462 .sk = { 1468 struct ip_reply_arg *arg, unsigned int len)
1463 .__sk_common = {
1464 .skc_refcnt = ATOMIC_INIT(1),
1465 },
1466 .sk_wmem_alloc = ATOMIC_INIT(1),
1467 .sk_allocation = GFP_ATOMIC,
1468 .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
1469 },
1470 .pmtudisc = IP_PMTUDISC_WANT,
1471 .uc_ttl = -1,
1472};
1473
1474void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
1475 __be32 saddr, const struct ip_reply_arg *arg,
1476 unsigned int len)
1477{ 1469{
1470 struct inet_sock *inet = inet_sk(sk);
1478 struct ip_options_data replyopts; 1471 struct ip_options_data replyopts;
1479 struct ipcm_cookie ipc; 1472 struct ipcm_cookie ipc;
1480 struct flowi4 fl4; 1473 struct flowi4 fl4;
1481 struct rtable *rt = skb_rtable(skb); 1474 struct rtable *rt = skb_rtable(skb);
1482 struct sk_buff *nskb;
1483 struct sock *sk;
1484 struct inet_sock *inet;
1485 1475
1486 if (ip_options_echo(&replyopts.opt.opt, skb)) 1476 if (ip_options_echo(&replyopts.opt.opt, skb))
1487 return; 1477 return;
@@ -1498,41 +1488,39 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
1498 } 1488 }
1499 1489
1500 flowi4_init_output(&fl4, arg->bound_dev_if, 0, 1490 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1501 RT_TOS(arg->tos), 1491 RT_TOS(ip_hdr(skb)->tos),
1502 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, 1492 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1503 ip_reply_arg_flowi_flags(arg), 1493 ip_reply_arg_flowi_flags(arg),
1504 daddr, saddr, 1494 daddr, rt->rt_spec_dst,
1505 tcp_hdr(skb)->source, tcp_hdr(skb)->dest); 1495 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1506 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); 1496 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1507 rt = ip_route_output_key(net, &fl4); 1497 rt = ip_route_output_key(sock_net(sk), &fl4);
1508 if (IS_ERR(rt)) 1498 if (IS_ERR(rt))
1509 return; 1499 return;
1510 1500
1511 inet = &get_cpu_var(unicast_sock); 1501 /* And let IP do all the hard work.
1512 1502
1513 inet->tos = arg->tos; 1503 This chunk is not reenterable, hence spinlock.
1514 sk = &inet->sk; 1504 Note that it uses the fact, that this function is called
1505 with locally disabled BH and that sk cannot be already spinlocked.
1506 */
1507 bh_lock_sock(sk);
1508 inet->tos = ip_hdr(skb)->tos;
1515 sk->sk_priority = skb->priority; 1509 sk->sk_priority = skb->priority;
1516 sk->sk_protocol = ip_hdr(skb)->protocol; 1510 sk->sk_protocol = ip_hdr(skb)->protocol;
1517 sk->sk_bound_dev_if = arg->bound_dev_if; 1511 sk->sk_bound_dev_if = arg->bound_dev_if;
1518 sock_net_set(sk, net);
1519 __skb_queue_head_init(&sk->sk_write_queue);
1520 sk->sk_sndbuf = sysctl_wmem_default;
1521 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, 1512 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1522 &ipc, &rt, MSG_DONTWAIT); 1513 &ipc, &rt, MSG_DONTWAIT);
1523 nskb = skb_peek(&sk->sk_write_queue); 1514 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1524 if (nskb) {
1525 if (arg->csumoffset >= 0) 1515 if (arg->csumoffset >= 0)
1526 *((__sum16 *)skb_transport_header(nskb) + 1516 *((__sum16 *)skb_transport_header(skb) +
1527 arg->csumoffset) = csum_fold(csum_add(nskb->csum, 1517 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1528 arg->csum)); 1518 arg->csum));
1529 nskb->ip_summed = CHECKSUM_NONE; 1519 skb->ip_summed = CHECKSUM_NONE;
1530 skb_orphan(nskb);
1531 skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
1532 ip_push_pending_frames(sk, &fl4); 1520 ip_push_pending_frames(sk, &fl4);
1533 } 1521 }
1534 1522
1535 put_cpu_var(unicast_sock); 1523 bh_unlock_sock(sk);
1536 1524
1537 ip_rt_put(rt); 1525 ip_rt_put(rt);
1538} 1526}
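Both __ip_append_data() and ip_append_page() above first try to extend the last page fragment in place when the new data is contiguous with it (skb_can_coalesce(), or the explicit page/offset bookkeeping in the older column), and only fall back to a fresh fragment slot up to MAX_SKB_FRAGS. A toy sketch of that decision with an illustrative fragment array, not the skb machinery itself:

#include <stdio.h>

#define MAX_FRAGS 4

struct frag { int page_id, off, size; };
struct buf  { struct frag frags[MAX_FRAGS]; int nr; };

/* Append (page_id, off, len): grow the last fragment if contiguous on the
 * same page, otherwise take a new slot; fail when the array is full. */
static int append(struct buf *b, int page_id, int off, int len)
{
        if (b->nr > 0) {
                struct frag *last = &b->frags[b->nr - 1];
                if (last->page_id == page_id && last->off + last->size == off) {
                        last->size += len;        /* coalesce with the tail */
                        return 0;
                }
        }
        if (b->nr == MAX_FRAGS)
                return -1;                        /* would be -EMSGSIZE */
        b->frags[b->nr++] = (struct frag){ page_id, off, len };
        return 0;
}

int main(void)
{
        struct buf b = { .nr = 0 };
        append(&b, 1, 0, 100);
        append(&b, 1, 100, 50);                   /* merges into one fragment */
        printf("%d frags, first size %d\n", b.nr, b.frags[0].size);
        return 0;
}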
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index d9c4f113d70..8905e92f896 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -33,14 +33,12 @@
33#include <linux/netfilter.h> 33#include <linux/netfilter.h>
34#include <linux/route.h> 34#include <linux/route.h>
35#include <linux/mroute.h> 35#include <linux/mroute.h>
36#include <net/inet_ecn.h>
37#include <net/route.h> 36#include <net/route.h>
38#include <net/xfrm.h> 37#include <net/xfrm.h>
39#include <net/compat.h> 38#include <net/compat.h>
40#if IS_ENABLED(CONFIG_IPV6) 39#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
41#include <net/transp_v6.h> 40#include <net/transp_v6.h>
42#endif 41#endif
43#include <net/ip_fib.h>
44 42
45#include <linux/errqueue.h> 43#include <linux/errqueue.h>
46#include <asm/uaccess.h> 44#include <asm/uaccess.h>
@@ -56,13 +54,20 @@
56/* 54/*
57 * SOL_IP control messages. 55 * SOL_IP control messages.
58 */ 56 */
59#define PKTINFO_SKB_CB(__skb) ((struct in_pktinfo *)((__skb)->cb))
60 57
61static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) 58static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
62{ 59{
63 struct in_pktinfo info = *PKTINFO_SKB_CB(skb); 60 struct in_pktinfo info;
61 struct rtable *rt = skb_rtable(skb);
64 62
65 info.ipi_addr.s_addr = ip_hdr(skb)->daddr; 63 info.ipi_addr.s_addr = ip_hdr(skb)->daddr;
64 if (rt) {
65 info.ipi_ifindex = rt->rt_iif;
66 info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
67 } else {
68 info.ipi_ifindex = 0;
69 info.ipi_spec_dst.s_addr = 0;
70 }
66 71
67 put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); 72 put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
68} 73}
@@ -91,7 +96,7 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
91static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) 96static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
92{ 97{
93 unsigned char optbuf[sizeof(struct ip_options) + 40]; 98 unsigned char optbuf[sizeof(struct ip_options) + 40];
94 struct ip_options *opt = (struct ip_options *)optbuf; 99 struct ip_options * opt = (struct ip_options *)optbuf;
95 100
96 if (IPCB(skb)->opt.optlen == 0) 101 if (IPCB(skb)->opt.optlen == 0)
97 return; 102 return;
@@ -148,7 +153,7 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
148void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) 153void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
149{ 154{
150 struct inet_sock *inet = inet_sk(skb->sk); 155 struct inet_sock *inet = inet_sk(skb->sk);
151 unsigned int flags = inet->cmsg_flags; 156 unsigned flags = inet->cmsg_flags;
152 157
153 /* Ordered by supposed usage frequency */ 158 /* Ordered by supposed usage frequency */
154 if (flags & 1) 159 if (flags & 1)
@@ -446,6 +451,11 @@ out:
446} 451}
447 452
448 453
454static void opt_kfree_rcu(struct rcu_head *head)
455{
456 kfree(container_of(head, struct ip_options_rcu, rcu));
457}
458
449/* 459/*
450 * Socket option code for IP. This is the end of the line after any 460 * Socket option code for IP. This is the end of the line after any
451 * TCP,UDP etc options on an IP socket. 461 * TCP,UDP etc options on an IP socket.
@@ -457,28 +467,18 @@ static int do_ip_setsockopt(struct sock *sk, int level,
457 struct inet_sock *inet = inet_sk(sk); 467 struct inet_sock *inet = inet_sk(sk);
458 int val = 0, err; 468 int val = 0, err;
459 469
460 switch (optname) { 470 if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) |
461 case IP_PKTINFO: 471 (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) |
462 case IP_RECVTTL: 472 (1<<IP_RETOPTS) | (1<<IP_TOS) |
463 case IP_RECVOPTS: 473 (1<<IP_TTL) | (1<<IP_HDRINCL) |
464 case IP_RECVTOS: 474 (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
465 case IP_RETOPTS: 475 (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
466 case IP_TOS: 476 (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
467 case IP_TTL: 477 (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
468 case IP_HDRINCL: 478 optname == IP_MULTICAST_TTL ||
469 case IP_MTU_DISCOVER: 479 optname == IP_MULTICAST_ALL ||
470 case IP_RECVERR: 480 optname == IP_MULTICAST_LOOP ||
471 case IP_ROUTER_ALERT: 481 optname == IP_RECVORIGDSTADDR) {
472 case IP_FREEBIND:
473 case IP_PASSSEC:
474 case IP_TRANSPARENT:
475 case IP_MINTTL:
476 case IP_NODEFRAG:
477 case IP_UNICAST_IF:
478 case IP_MULTICAST_TTL:
479 case IP_MULTICAST_ALL:
480 case IP_MULTICAST_LOOP:
481 case IP_RECVORIGDSTADDR:
482 if (optlen >= sizeof(int)) { 482 if (optlen >= sizeof(int)) {
483 if (get_user(val, (int __user *) optval)) 483 if (get_user(val, (int __user *) optval))
484 return -EFAULT; 484 return -EFAULT;
@@ -514,7 +514,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
514 sock_owned_by_user(sk)); 514 sock_owned_by_user(sk));
515 if (inet->is_icsk) { 515 if (inet->is_icsk) {
516 struct inet_connection_sock *icsk = inet_csk(sk); 516 struct inet_connection_sock *icsk = inet_csk(sk);
517#if IS_ENABLED(CONFIG_IPV6) 517#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
518 if (sk->sk_family == PF_INET || 518 if (sk->sk_family == PF_INET ||
519 (!((1 << sk->sk_state) & 519 (!((1 << sk->sk_state) &
520 (TCPF_LISTEN | TCPF_CLOSE)) && 520 (TCPF_LISTEN | TCPF_CLOSE)) &&
@@ -525,13 +525,13 @@ static int do_ip_setsockopt(struct sock *sk, int level,
525 if (opt) 525 if (opt)
526 icsk->icsk_ext_hdr_len += opt->opt.optlen; 526 icsk->icsk_ext_hdr_len += opt->opt.optlen;
527 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); 527 icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
528#if IS_ENABLED(CONFIG_IPV6) 528#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
529 } 529 }
530#endif 530#endif
531 } 531 }
532 rcu_assign_pointer(inet->inet_opt, opt); 532 rcu_assign_pointer(inet->inet_opt, opt);
533 if (old) 533 if (old)
534 kfree_rcu(old, rcu); 534 call_rcu(&old->rcu, opt_kfree_rcu);
535 break; 535 break;
536 } 536 }
537 case IP_PKTINFO: 537 case IP_PKTINFO:
@@ -578,8 +578,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
578 break; 578 break;
579 case IP_TOS: /* This sets both TOS and Precedence */ 579 case IP_TOS: /* This sets both TOS and Precedence */
580 if (sk->sk_type == SOCK_STREAM) { 580 if (sk->sk_type == SOCK_STREAM) {
581 val &= ~INET_ECN_MASK; 581 val &= ~3;
582 val |= inet->tos & INET_ECN_MASK; 582 val |= inet->tos & 3;
583 } 583 }
584 if (inet->tos != val) { 584 if (inet->tos != val) {
585 inet->tos = val; 585 inet->tos = val;
@@ -590,7 +590,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
590 case IP_TTL: 590 case IP_TTL:
591 if (optlen < 1) 591 if (optlen < 1)
592 goto e_inval; 592 goto e_inval;
593 if (val != -1 && (val < 1 || val > 255)) 593 if (val != -1 && (val < 0 || val > 255))
594 goto e_inval; 594 goto e_inval;
595 inet->uc_ttl = val; 595 inet->uc_ttl = val;
596 break; 596 break;
@@ -634,35 +634,6 @@ static int do_ip_setsockopt(struct sock *sk, int level,
634 goto e_inval; 634 goto e_inval;
635 inet->mc_loop = !!val; 635 inet->mc_loop = !!val;
636 break; 636 break;
637 case IP_UNICAST_IF:
638 {
639 struct net_device *dev = NULL;
640 int ifindex;
641
642 if (optlen != sizeof(int))
643 goto e_inval;
644
645 ifindex = (__force int)ntohl((__force __be32)val);
646 if (ifindex == 0) {
647 inet->uc_index = 0;
648 err = 0;
649 break;
650 }
651
652 dev = dev_get_by_index(sock_net(sk), ifindex);
653 err = -EADDRNOTAVAIL;
654 if (!dev)
655 break;
656 dev_put(dev);
657
658 err = -EINVAL;
659 if (sk->sk_bound_dev_if)
660 break;
661
662 inet->uc_index = ifindex;
663 err = 0;
664 break;
665 }
666 case IP_MULTICAST_IF: 637 case IP_MULTICAST_IF:
667 { 638 {
668 struct ip_mreqn mreq; 639 struct ip_mreqn mreq;
@@ -683,15 +654,10 @@ static int do_ip_setsockopt(struct sock *sk, int level,
683 break; 654 break;
684 } else { 655 } else {
685 memset(&mreq, 0, sizeof(mreq)); 656 memset(&mreq, 0, sizeof(mreq));
686 if (optlen >= sizeof(struct ip_mreq)) { 657 if (optlen >= sizeof(struct in_addr) &&
687 if (copy_from_user(&mreq, optval, 658 copy_from_user(&mreq.imr_address, optval,
688 sizeof(struct ip_mreq))) 659 sizeof(struct in_addr)))
689 break; 660 break;
690 } else if (optlen >= sizeof(struct in_addr)) {
691 if (copy_from_user(&mreq.imr_address, optval,
692 sizeof(struct in_addr)))
693 break;
694 }
695 } 661 }
696 662
697 if (!mreq.imr_ifindex) { 663 if (!mreq.imr_ifindex) {
@@ -989,14 +955,13 @@ mc_msf_out:
989 case IP_IPSEC_POLICY: 955 case IP_IPSEC_POLICY:
990 case IP_XFRM_POLICY: 956 case IP_XFRM_POLICY:
991 err = -EPERM; 957 err = -EPERM;
992 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 958 if (!capable(CAP_NET_ADMIN))
993 break; 959 break;
994 err = xfrm_user_policy(sk, optname, optval, optlen); 960 err = xfrm_user_policy(sk, optname, optval, optlen);
995 break; 961 break;
996 962
997 case IP_TRANSPARENT: 963 case IP_TRANSPARENT:
998 if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 964 if (!capable(CAP_NET_ADMIN)) {
999 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1000 err = -EPERM; 965 err = -EPERM;
1001 break; 966 break;
1002 } 967 }
@@ -1026,27 +991,20 @@ e_inval:
1026} 991}
1027 992
1028/** 993/**
1029 * ipv4_pktinfo_prepare - transfer some info from rtable to skb 994 * ip_queue_rcv_skb - Queue an skb into sock receive queue
1030 * @sk: socket 995 * @sk: socket
1031 * @skb: buffer 996 * @skb: buffer
1032 * 997 *
1033 * To support IP_CMSG_PKTINFO option, we store rt_iif and specific 998 * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option
1034 * destination in skb->cb[] before dst drop. 999 * is not set, we drop skb dst entry now, while dst cache line is hot.
1035 * This way, the receiver doesn't take cache line misses to read the rtable.
1036 */ 1000 */
1037void ipv4_pktinfo_prepare(struct sk_buff *skb) 1001int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1038{ 1002{
1039 struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); 1003 if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO))
1040 1004 skb_dst_drop(skb);
1041 if (skb_rtable(skb)) { 1005 return sock_queue_rcv_skb(sk, skb);
1042 pktinfo->ipi_ifindex = inet_iif(skb);
1043 pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
1044 } else {
1045 pktinfo->ipi_ifindex = 0;
1046 pktinfo->ipi_spec_dst.s_addr = 0;
1047 }
1048 skb_dst_drop(skb);
1049} 1006}
1007EXPORT_SYMBOL(ip_queue_rcv_skb);
1050 1008
1051int ip_setsockopt(struct sock *sk, int level, 1009int ip_setsockopt(struct sock *sk, int level,
1052 int optname, char __user *optval, unsigned int optlen) 1010 int optname, char __user *optval, unsigned int optlen)
@@ -1109,7 +1067,7 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
1109 */ 1067 */
1110 1068
1111static int do_ip_getsockopt(struct sock *sk, int level, int optname, 1069static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1112 char __user *optval, int __user *optlen, unsigned int flags) 1070 char __user *optval, int __user *optlen, unsigned flags)
1113{ 1071{
1114 struct inet_sock *inet = inet_sk(sk); 1072 struct inet_sock *inet = inet_sk(sk);
1115 int val; 1073 int val;
@@ -1218,9 +1176,6 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1218 case IP_MULTICAST_LOOP: 1176 case IP_MULTICAST_LOOP:
1219 val = inet->mc_loop; 1177 val = inet->mc_loop;
1220 break; 1178 break;
1221 case IP_UNICAST_IF:
1222 val = (__force int)htonl((__u32) inet->uc_index);
1223 break;
1224 case IP_MULTICAST_IF: 1179 case IP_MULTICAST_IF:
1225 { 1180 {
1226 struct in_addr addr; 1181 struct in_addr addr;
@@ -1299,10 +1254,6 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
1299 int hlim = inet->mc_ttl; 1254 int hlim = inet->mc_ttl;
1300 put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); 1255 put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
1301 } 1256 }
1302 if (inet->cmsg_flags & IP_CMSG_TOS) {
1303 int tos = inet->rcv_tos;
1304 put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
1305 }
1306 len -= msg.msg_controllen; 1257 len -= msg.msg_controllen;
1307 return put_user(len, optlen); 1258 return put_user(len, optlen);
1308 } 1259 }
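
The ip_sockglue.c hunks above revert the newer IP_PKTINFO plumbing (ipv4_pktinfo_prepare() and the per-option switch) to the older ip_queue_rcv_skb() path, but the user-visible contract of IP_PKTINFO is the same either way: once the option is set, ip_cmsg_recv_pktinfo() attaches a struct in_pktinfo control message to every received datagram. The user-space sketch below is only meant to show that contract; it is not part of the patch, and the port number and buffer sizes are arbitrary.

#define _GNU_SOURCE                     /* for struct in_pktinfo in <netinet/in.h> */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    int on = 1;
    struct sockaddr_in sin = { .sin_family = AF_INET,
                               .sin_port = htons(5000) };      /* arbitrary port */
    char data[2048], cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
    struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
    struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                          .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
    struct cmsghdr *cmsg;

    if (fd < 0)
        return 1;

    /* Ask the kernel to attach IP_PKTINFO ancillary data to each datagram. */
    setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &on, sizeof(on));
    bind(fd, (struct sockaddr *)&sin, sizeof(sin));

    if (recvmsg(fd, &msg, 0) < 0)
        return 1;

    /* Walk the control messages filled in by ip_cmsg_recv_pktinfo(). */
    for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
        if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_PKTINFO) {
            struct in_pktinfo info;

            memcpy(&info, CMSG_DATA(cmsg), sizeof(info));
            printf("ifindex=%d dst=%s\n",
                   info.ipi_ifindex, inet_ntoa(info.ipi_addr));
        }
    }
    close(fd);
    return 0;
}
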
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
deleted file mode 100644
index c3a4233c0ac..00000000000
--- a/net/ipv4/ip_vti.c
+++ /dev/null
@@ -1,942 +0,0 @@
1/*
2 * Linux NET3: IP/IP protocol decoder modified to support
3 * virtual tunnel interface
4 *
5 * Authors:
6 * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 *
13 */
14
15/*
16 This version of net/ipv4/ip_vti.c is cloned from net/ipv4/ipip.c
17
18 For comments look at net/ipv4/ip_gre.c --ANK
19 */
20
21
22#include <linux/capability.h>
23#include <linux/module.h>
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/uaccess.h>
27#include <linux/skbuff.h>
28#include <linux/netdevice.h>
29#include <linux/in.h>
30#include <linux/tcp.h>
31#include <linux/udp.h>
32#include <linux/if_arp.h>
33#include <linux/mroute.h>
34#include <linux/init.h>
35#include <linux/netfilter_ipv4.h>
36#include <linux/if_ether.h>
37
38#include <net/sock.h>
39#include <net/ip.h>
40#include <net/icmp.h>
41#include <net/ipip.h>
42#include <net/inet_ecn.h>
43#include <net/xfrm.h>
44#include <net/net_namespace.h>
45#include <net/netns/generic.h>
46
47#define HASH_SIZE 16
48#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))
49
50static struct rtnl_link_ops vti_link_ops __read_mostly;
51
52static int vti_net_id __read_mostly;
53struct vti_net {
54 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
55 struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
56 struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
57 struct ip_tunnel __rcu *tunnels_wc[1];
58 struct ip_tunnel __rcu **tunnels[4];
59
60 struct net_device *fb_tunnel_dev;
61};
62
63static int vti_fb_tunnel_init(struct net_device *dev);
64static int vti_tunnel_init(struct net_device *dev);
65static void vti_tunnel_setup(struct net_device *dev);
66static void vti_dev_free(struct net_device *dev);
67static int vti_tunnel_bind_dev(struct net_device *dev);
68
69#define VTI_XMIT(stats1, stats2) do { \
70 int err; \
71 int pkt_len = skb->len; \
72 err = dst_output(skb); \
73 if (net_xmit_eval(err) == 0) { \
74 u64_stats_update_begin(&(stats1)->syncp); \
75 (stats1)->tx_bytes += pkt_len; \
76 (stats1)->tx_packets++; \
77 u64_stats_update_end(&(stats1)->syncp); \
78 } else { \
79 (stats2)->tx_errors++; \
80 (stats2)->tx_aborted_errors++; \
81 } \
82} while (0)
83
84
85static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev,
86 struct rtnl_link_stats64 *tot)
87{
88 int i;
89
90 for_each_possible_cpu(i) {
91 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
92 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
93 unsigned int start;
94
95 do {
96 start = u64_stats_fetch_begin_bh(&tstats->syncp);
97 rx_packets = tstats->rx_packets;
98 tx_packets = tstats->tx_packets;
99 rx_bytes = tstats->rx_bytes;
100 tx_bytes = tstats->tx_bytes;
101 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
102
103 tot->rx_packets += rx_packets;
104 tot->tx_packets += tx_packets;
105 tot->rx_bytes += rx_bytes;
106 tot->tx_bytes += tx_bytes;
107 }
108
109 tot->multicast = dev->stats.multicast;
110 tot->rx_crc_errors = dev->stats.rx_crc_errors;
111 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
112 tot->rx_length_errors = dev->stats.rx_length_errors;
113 tot->rx_errors = dev->stats.rx_errors;
114 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
115 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
116 tot->tx_dropped = dev->stats.tx_dropped;
117 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
118 tot->tx_errors = dev->stats.tx_errors;
119
120 return tot;
121}
122
123static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
124 __be32 remote, __be32 local)
125{
126 unsigned h0 = HASH(remote);
127 unsigned h1 = HASH(local);
128 struct ip_tunnel *t;
129 struct vti_net *ipn = net_generic(net, vti_net_id);
130
131 for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
132 if (local == t->parms.iph.saddr &&
133 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
134 return t;
135 for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
136 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
137 return t;
138
139 for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
140 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
141 return t;
142
143 for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0])
144 if (t && (t->dev->flags&IFF_UP))
145 return t;
146 return NULL;
147}
148
149static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn,
150 struct ip_tunnel_parm *parms)
151{
152 __be32 remote = parms->iph.daddr;
153 __be32 local = parms->iph.saddr;
154 unsigned h = 0;
155 int prio = 0;
156
157 if (remote) {
158 prio |= 2;
159 h ^= HASH(remote);
160 }
161 if (local) {
162 prio |= 1;
163 h ^= HASH(local);
164 }
165 return &ipn->tunnels[prio][h];
166}
167
168static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn,
169 struct ip_tunnel *t)
170{
171 return __vti_bucket(ipn, &t->parms);
172}
173
174static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
175{
176 struct ip_tunnel __rcu **tp;
177 struct ip_tunnel *iter;
178
179 for (tp = vti_bucket(ipn, t);
180 (iter = rtnl_dereference(*tp)) != NULL;
181 tp = &iter->next) {
182 if (t == iter) {
183 rcu_assign_pointer(*tp, t->next);
184 break;
185 }
186 }
187}
188
189static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
190{
191 struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);
192
193 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
194 rcu_assign_pointer(*tp, t);
195}
196
197static struct ip_tunnel *vti_tunnel_locate(struct net *net,
198 struct ip_tunnel_parm *parms,
199 int create)
200{
201 __be32 remote = parms->iph.daddr;
202 __be32 local = parms->iph.saddr;
203 struct ip_tunnel *t, *nt;
204 struct ip_tunnel __rcu **tp;
205 struct net_device *dev;
206 char name[IFNAMSIZ];
207 struct vti_net *ipn = net_generic(net, vti_net_id);
208
209 for (tp = __vti_bucket(ipn, parms);
210 (t = rtnl_dereference(*tp)) != NULL;
211 tp = &t->next) {
212 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
213 return t;
214 }
215 if (!create)
216 return NULL;
217
218 if (parms->name[0])
219 strlcpy(name, parms->name, IFNAMSIZ);
220 else
221 strcpy(name, "vti%d");
222
223 dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
224 if (dev == NULL)
225 return NULL;
226
227 dev_net_set(dev, net);
228
229 nt = netdev_priv(dev);
230 nt->parms = *parms;
231 dev->rtnl_link_ops = &vti_link_ops;
232
233 vti_tunnel_bind_dev(dev);
234
235 if (register_netdevice(dev) < 0)
236 goto failed_free;
237
238 dev_hold(dev);
239 vti_tunnel_link(ipn, nt);
240 return nt;
241
242failed_free:
243 free_netdev(dev);
244 return NULL;
245}
246
247static void vti_tunnel_uninit(struct net_device *dev)
248{
249 struct net *net = dev_net(dev);
250 struct vti_net *ipn = net_generic(net, vti_net_id);
251
252 vti_tunnel_unlink(ipn, netdev_priv(dev));
253 dev_put(dev);
254}
255
256static int vti_err(struct sk_buff *skb, u32 info)
257{
258
259 /* All the routers (except for Linux) return only
260 * 8 bytes of packet payload. This means that precise relaying of
261 * ICMP in the real Internet is absolutely infeasible.
262 */
263 struct iphdr *iph = (struct iphdr *)skb->data;
264 const int type = icmp_hdr(skb)->type;
265 const int code = icmp_hdr(skb)->code;
266 struct ip_tunnel *t;
267 int err;
268
269 switch (type) {
270 default:
271 case ICMP_PARAMETERPROB:
272 return 0;
273
274 case ICMP_DEST_UNREACH:
275 switch (code) {
276 case ICMP_SR_FAILED:
277 case ICMP_PORT_UNREACH:
278 /* Impossible event. */
279 return 0;
280 default:
281 /* All others are translated to HOST_UNREACH. */
282 break;
283 }
284 break;
285 case ICMP_TIME_EXCEEDED:
286 if (code != ICMP_EXC_TTL)
287 return 0;
288 break;
289 }
290
291 err = -ENOENT;
292
293 t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
294 if (t == NULL)
295 goto out;
296
297 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
298 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
299 t->parms.link, 0, IPPROTO_IPIP, 0);
300 err = 0;
301 goto out;
302 }
303
304 err = 0;
305 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
306 goto out;
307
308 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
309 t->err_count++;
310 else
311 t->err_count = 1;
312 t->err_time = jiffies;
313out:
314 return err;
315}
316
317/* We don't digest the packet, therefore let the packet pass */
318static int vti_rcv(struct sk_buff *skb)
319{
320 struct ip_tunnel *tunnel;
321 const struct iphdr *iph = ip_hdr(skb);
322
323 tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
324 if (tunnel != NULL) {
325 struct pcpu_tstats *tstats;
326
327 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
328 return -1;
329
330 tstats = this_cpu_ptr(tunnel->dev->tstats);
331 u64_stats_update_begin(&tstats->syncp);
332 tstats->rx_packets++;
333 tstats->rx_bytes += skb->len;
334 u64_stats_update_end(&tstats->syncp);
335
336 skb->mark = 0;
337 secpath_reset(skb);
338 skb->dev = tunnel->dev;
339 return 1;
340 }
341
342 return -1;
343}
344
345/* This function assumes it is being called from dev_queue_xmit()
346 * and that skb is filled properly by that function.
347 */
348
349static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
350{
351 struct ip_tunnel *tunnel = netdev_priv(dev);
352 struct pcpu_tstats *tstats;
353 struct iphdr *tiph = &tunnel->parms.iph;
354 u8 tos;
355 struct rtable *rt; /* Route to the other host */
356 struct net_device *tdev; /* Device to other host */
357 struct iphdr *old_iph = ip_hdr(skb);
358 __be32 dst = tiph->daddr;
359 struct flowi4 fl4;
360
361 if (skb->protocol != htons(ETH_P_IP))
362 goto tx_error;
363
364 tos = old_iph->tos;
365
366 memset(&fl4, 0, sizeof(fl4));
367 flowi4_init_output(&fl4, tunnel->parms.link,
368 be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos),
369 RT_SCOPE_UNIVERSE,
370 IPPROTO_IPIP, 0,
371 dst, tiph->saddr, 0, 0);
372 rt = ip_route_output_key(dev_net(dev), &fl4);
373 if (IS_ERR(rt)) {
374 dev->stats.tx_carrier_errors++;
375 goto tx_error_icmp;
376 }
377 /* if there is no transform then this tunnel is not functional.
378 * Or if the xfrm is not mode tunnel.
379 */
380 if (!rt->dst.xfrm ||
381 rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
382 dev->stats.tx_carrier_errors++;
383 goto tx_error_icmp;
384 }
385 tdev = rt->dst.dev;
386
387 if (tdev == dev) {
388 ip_rt_put(rt);
389 dev->stats.collisions++;
390 goto tx_error;
391 }
392
393 if (tunnel->err_count > 0) {
394 if (time_before(jiffies,
395 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
396 tunnel->err_count--;
397 dst_link_failure(skb);
398 } else
399 tunnel->err_count = 0;
400 }
401
402 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
403 IPSKB_REROUTED);
404 skb_dst_drop(skb);
405 skb_dst_set(skb, &rt->dst);
406 nf_reset(skb);
407 skb->dev = skb_dst(skb)->dev;
408
409 tstats = this_cpu_ptr(dev->tstats);
410 VTI_XMIT(tstats, &dev->stats);
411 return NETDEV_TX_OK;
412
413tx_error_icmp:
414 dst_link_failure(skb);
415tx_error:
416 dev->stats.tx_errors++;
417 dev_kfree_skb(skb);
418 return NETDEV_TX_OK;
419}
420
421static int vti_tunnel_bind_dev(struct net_device *dev)
422{
423 struct net_device *tdev = NULL;
424 struct ip_tunnel *tunnel;
425 struct iphdr *iph;
426
427 tunnel = netdev_priv(dev);
428 iph = &tunnel->parms.iph;
429
430 if (iph->daddr) {
431 struct rtable *rt;
432 struct flowi4 fl4;
433 memset(&fl4, 0, sizeof(fl4));
434 flowi4_init_output(&fl4, tunnel->parms.link,
435 be32_to_cpu(tunnel->parms.i_key),
436 RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
437 IPPROTO_IPIP, 0,
438 iph->daddr, iph->saddr, 0, 0);
439 rt = ip_route_output_key(dev_net(dev), &fl4);
440 if (!IS_ERR(rt)) {
441 tdev = rt->dst.dev;
442 ip_rt_put(rt);
443 }
444 dev->flags |= IFF_POINTOPOINT;
445 }
446
447 if (!tdev && tunnel->parms.link)
448 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
449
450 if (tdev) {
451 dev->hard_header_len = tdev->hard_header_len +
452 sizeof(struct iphdr);
453 dev->mtu = tdev->mtu;
454 }
455 dev->iflink = tunnel->parms.link;
456 return dev->mtu;
457}
458
459static int
460vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
461{
462 int err = 0;
463 struct ip_tunnel_parm p;
464 struct ip_tunnel *t;
465 struct net *net = dev_net(dev);
466 struct vti_net *ipn = net_generic(net, vti_net_id);
467
468 switch (cmd) {
469 case SIOCGETTUNNEL:
470 t = NULL;
471 if (dev == ipn->fb_tunnel_dev) {
472 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
473 sizeof(p))) {
474 err = -EFAULT;
475 break;
476 }
477 t = vti_tunnel_locate(net, &p, 0);
478 }
479 if (t == NULL)
480 t = netdev_priv(dev);
481 memcpy(&p, &t->parms, sizeof(p));
482 p.i_flags |= GRE_KEY | VTI_ISVTI;
483 p.o_flags |= GRE_KEY;
484 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
485 err = -EFAULT;
486 break;
487
488 case SIOCADDTUNNEL:
489 case SIOCCHGTUNNEL:
490 err = -EPERM;
491 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
492 goto done;
493
494 err = -EFAULT;
495 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
496 goto done;
497
498 err = -EINVAL;
499 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
500 p.iph.ihl != 5)
501 goto done;
502
503 t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
504
505 if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
506 if (t != NULL) {
507 if (t->dev != dev) {
508 err = -EEXIST;
509 break;
510 }
511 } else {
512 if (((dev->flags&IFF_POINTOPOINT) &&
513 !p.iph.daddr) ||
514 (!(dev->flags&IFF_POINTOPOINT) &&
515 p.iph.daddr)) {
516 err = -EINVAL;
517 break;
518 }
519 t = netdev_priv(dev);
520 vti_tunnel_unlink(ipn, t);
521 synchronize_net();
522 t->parms.iph.saddr = p.iph.saddr;
523 t->parms.iph.daddr = p.iph.daddr;
524 t->parms.i_key = p.i_key;
525 t->parms.o_key = p.o_key;
526 t->parms.iph.protocol = IPPROTO_IPIP;
527 memcpy(dev->dev_addr, &p.iph.saddr, 4);
528 memcpy(dev->broadcast, &p.iph.daddr, 4);
529 vti_tunnel_link(ipn, t);
530 netdev_state_change(dev);
531 }
532 }
533
534 if (t) {
535 err = 0;
536 if (cmd == SIOCCHGTUNNEL) {
537 t->parms.i_key = p.i_key;
538 t->parms.o_key = p.o_key;
539 if (t->parms.link != p.link) {
540 t->parms.link = p.link;
541 vti_tunnel_bind_dev(dev);
542 netdev_state_change(dev);
543 }
544 }
545 p.i_flags |= GRE_KEY | VTI_ISVTI;
546 p.o_flags |= GRE_KEY;
547 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms,
548 sizeof(p)))
549 err = -EFAULT;
550 } else
551 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
552 break;
553
554 case SIOCDELTUNNEL:
555 err = -EPERM;
556 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
557 goto done;
558
559 if (dev == ipn->fb_tunnel_dev) {
560 err = -EFAULT;
561 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
562 sizeof(p)))
563 goto done;
564 err = -ENOENT;
565
566 t = vti_tunnel_locate(net, &p, 0);
567 if (t == NULL)
568 goto done;
569 err = -EPERM;
570 if (t->dev == ipn->fb_tunnel_dev)
571 goto done;
572 dev = t->dev;
573 }
574 unregister_netdevice(dev);
575 err = 0;
576 break;
577
578 default:
579 err = -EINVAL;
580 }
581
582done:
583 return err;
584}
585
586static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
587{
588 if (new_mtu < 68 || new_mtu > 0xFFF8)
589 return -EINVAL;
590 dev->mtu = new_mtu;
591 return 0;
592}
593
594static const struct net_device_ops vti_netdev_ops = {
595 .ndo_init = vti_tunnel_init,
596 .ndo_uninit = vti_tunnel_uninit,
597 .ndo_start_xmit = vti_tunnel_xmit,
598 .ndo_do_ioctl = vti_tunnel_ioctl,
599 .ndo_change_mtu = vti_tunnel_change_mtu,
600 .ndo_get_stats64 = vti_get_stats64,
601};
602
603static void vti_dev_free(struct net_device *dev)
604{
605 free_percpu(dev->tstats);
606 free_netdev(dev);
607}
608
609static void vti_tunnel_setup(struct net_device *dev)
610{
611 dev->netdev_ops = &vti_netdev_ops;
612 dev->destructor = vti_dev_free;
613
614 dev->type = ARPHRD_TUNNEL;
615 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr);
616 dev->mtu = ETH_DATA_LEN;
617 dev->flags = IFF_NOARP;
618 dev->iflink = 0;
619 dev->addr_len = 4;
620 dev->features |= NETIF_F_NETNS_LOCAL;
621 dev->features |= NETIF_F_LLTX;
622 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
623}
624
625static int vti_tunnel_init(struct net_device *dev)
626{
627 struct ip_tunnel *tunnel = netdev_priv(dev);
628
629 tunnel->dev = dev;
630 strcpy(tunnel->parms.name, dev->name);
631
632 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
633 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
634
635 dev->tstats = alloc_percpu(struct pcpu_tstats);
636 if (!dev->tstats)
637 return -ENOMEM;
638
639 return 0;
640}
641
642static int __net_init vti_fb_tunnel_init(struct net_device *dev)
643{
644 struct ip_tunnel *tunnel = netdev_priv(dev);
645 struct iphdr *iph = &tunnel->parms.iph;
646 struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);
647
648 tunnel->dev = dev;
649 strcpy(tunnel->parms.name, dev->name);
650
651 iph->version = 4;
652 iph->protocol = IPPROTO_IPIP;
653 iph->ihl = 5;
654
655 dev->tstats = alloc_percpu(struct pcpu_tstats);
656 if (!dev->tstats)
657 return -ENOMEM;
658
659 dev_hold(dev);
660 rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
661 return 0;
662}
663
664static struct xfrm_tunnel vti_handler __read_mostly = {
665 .handler = vti_rcv,
666 .err_handler = vti_err,
667 .priority = 1,
668};
669
670static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
671{
672 int prio;
673
674 for (prio = 1; prio < 4; prio++) {
675 int h;
676 for (h = 0; h < HASH_SIZE; h++) {
677 struct ip_tunnel *t;
678
679 t = rtnl_dereference(ipn->tunnels[prio][h]);
680 while (t != NULL) {
681 unregister_netdevice_queue(t->dev, head);
682 t = rtnl_dereference(t->next);
683 }
684 }
685 }
686}
687
688static int __net_init vti_init_net(struct net *net)
689{
690 int err;
691 struct vti_net *ipn = net_generic(net, vti_net_id);
692
693 ipn->tunnels[0] = ipn->tunnels_wc;
694 ipn->tunnels[1] = ipn->tunnels_l;
695 ipn->tunnels[2] = ipn->tunnels_r;
696 ipn->tunnels[3] = ipn->tunnels_r_l;
697
698 ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
699 "ip_vti0",
700 vti_tunnel_setup);
701 if (!ipn->fb_tunnel_dev) {
702 err = -ENOMEM;
703 goto err_alloc_dev;
704 }
705 dev_net_set(ipn->fb_tunnel_dev, net);
706
707 err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
708 if (err)
709 goto err_reg_dev;
710 ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;
711
712 err = register_netdev(ipn->fb_tunnel_dev);
713 if (err)
714 goto err_reg_dev;
715 return 0;
716
717err_reg_dev:
718 vti_dev_free(ipn->fb_tunnel_dev);
719err_alloc_dev:
720 /* nothing */
721 return err;
722}
723
724static void __net_exit vti_exit_net(struct net *net)
725{
726 struct vti_net *ipn = net_generic(net, vti_net_id);
727 LIST_HEAD(list);
728
729 rtnl_lock();
730 vti_destroy_tunnels(ipn, &list);
731 unregister_netdevice_many(&list);
732 rtnl_unlock();
733}
734
735static struct pernet_operations vti_net_ops = {
736 .init = vti_init_net,
737 .exit = vti_exit_net,
738 .id = &vti_net_id,
739 .size = sizeof(struct vti_net),
740};
741
742static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
743{
744 return 0;
745}
746
747static void vti_netlink_parms(struct nlattr *data[],
748 struct ip_tunnel_parm *parms)
749{
750 memset(parms, 0, sizeof(*parms));
751
752 parms->iph.protocol = IPPROTO_IPIP;
753
754 if (!data)
755 return;
756
757 if (data[IFLA_VTI_LINK])
758 parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
759
760 if (data[IFLA_VTI_IKEY])
761 parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
762
763 if (data[IFLA_VTI_OKEY])
764 parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);
765
766 if (data[IFLA_VTI_LOCAL])
767 parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]);
768
769 if (data[IFLA_VTI_REMOTE])
770 parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]);
771
772}
773
774static int vti_newlink(struct net *src_net, struct net_device *dev,
775 struct nlattr *tb[], struct nlattr *data[])
776{
777 struct ip_tunnel *nt;
778 struct net *net = dev_net(dev);
779 struct vti_net *ipn = net_generic(net, vti_net_id);
780 int mtu;
781 int err;
782
783 nt = netdev_priv(dev);
784 vti_netlink_parms(data, &nt->parms);
785
786 if (vti_tunnel_locate(net, &nt->parms, 0))
787 return -EEXIST;
788
789 mtu = vti_tunnel_bind_dev(dev);
790 if (!tb[IFLA_MTU])
791 dev->mtu = mtu;
792
793 err = register_netdevice(dev);
794 if (err)
795 goto out;
796
797 dev_hold(dev);
798 vti_tunnel_link(ipn, nt);
799
800out:
801 return err;
802}
803
804static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
805 struct nlattr *data[])
806{
807 struct ip_tunnel *t, *nt;
808 struct net *net = dev_net(dev);
809 struct vti_net *ipn = net_generic(net, vti_net_id);
810 struct ip_tunnel_parm p;
811 int mtu;
812
813 if (dev == ipn->fb_tunnel_dev)
814 return -EINVAL;
815
816 nt = netdev_priv(dev);
817 vti_netlink_parms(data, &p);
818
819 t = vti_tunnel_locate(net, &p, 0);
820
821 if (t) {
822 if (t->dev != dev)
823 return -EEXIST;
824 } else {
825 t = nt;
826
827 vti_tunnel_unlink(ipn, t);
828 t->parms.iph.saddr = p.iph.saddr;
829 t->parms.iph.daddr = p.iph.daddr;
830 t->parms.i_key = p.i_key;
831 t->parms.o_key = p.o_key;
832 if (dev->type != ARPHRD_ETHER) {
833 memcpy(dev->dev_addr, &p.iph.saddr, 4);
834 memcpy(dev->broadcast, &p.iph.daddr, 4);
835 }
836 vti_tunnel_link(ipn, t);
837 netdev_state_change(dev);
838 }
839
840 if (t->parms.link != p.link) {
841 t->parms.link = p.link;
842 mtu = vti_tunnel_bind_dev(dev);
843 if (!tb[IFLA_MTU])
844 dev->mtu = mtu;
845 netdev_state_change(dev);
846 }
847
848 return 0;
849}
850
851static size_t vti_get_size(const struct net_device *dev)
852{
853 return
854 /* IFLA_VTI_LINK */
855 nla_total_size(4) +
856 /* IFLA_VTI_IKEY */
857 nla_total_size(4) +
858 /* IFLA_VTI_OKEY */
859 nla_total_size(4) +
860 /* IFLA_VTI_LOCAL */
861 nla_total_size(4) +
862 /* IFLA_VTI_REMOTE */
863 nla_total_size(4) +
864 0;
865}
866
867static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
868{
869 struct ip_tunnel *t = netdev_priv(dev);
870 struct ip_tunnel_parm *p = &t->parms;
871
872 nla_put_u32(skb, IFLA_VTI_LINK, p->link);
873 nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
874 nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
875 nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr);
876 nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr);
877
878 return 0;
879}
880
881static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
882 [IFLA_VTI_LINK] = { .type = NLA_U32 },
883 [IFLA_VTI_IKEY] = { .type = NLA_U32 },
884 [IFLA_VTI_OKEY] = { .type = NLA_U32 },
885 [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
886 [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
887};
888
889static struct rtnl_link_ops vti_link_ops __read_mostly = {
890 .kind = "vti",
891 .maxtype = IFLA_VTI_MAX,
892 .policy = vti_policy,
893 .priv_size = sizeof(struct ip_tunnel),
894 .setup = vti_tunnel_setup,
895 .validate = vti_tunnel_validate,
896 .newlink = vti_newlink,
897 .changelink = vti_changelink,
898 .get_size = vti_get_size,
899 .fill_info = vti_fill_info,
900};
901
902static int __init vti_init(void)
903{
904 int err;
905
906 pr_info("IPv4 over IPSec tunneling driver\n");
907
908 err = register_pernet_device(&vti_net_ops);
909 if (err < 0)
910 return err;
911 err = xfrm4_mode_tunnel_input_register(&vti_handler);
912 if (err < 0) {
913 unregister_pernet_device(&vti_net_ops);
914 pr_info("vti init: can't register tunnel\n");
915 }
916
917 err = rtnl_link_register(&vti_link_ops);
918 if (err < 0)
919 goto rtnl_link_failed;
920
921 return err;
922
923rtnl_link_failed:
924 xfrm4_mode_tunnel_input_deregister(&vti_handler);
925 unregister_pernet_device(&vti_net_ops);
926 return err;
927}
928
929static void __exit vti_fini(void)
930{
931 rtnl_link_unregister(&vti_link_ops);
932 if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
933 pr_info("vti close: can't deregister tunnel\n");
934
935 unregister_pernet_device(&vti_net_ops);
936}
937
938module_init(vti_init);
939module_exit(vti_fini);
940MODULE_LICENSE("GPL");
941MODULE_ALIAS_RTNL_LINK("vti");
942MODULE_ALIAS_NETDEV("ip_vti0");
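
For reference, the deleted ip_vti.c keeps its tunnels in four hash tables selected by which of the local/remote endpoints are set (see __vti_bucket() and vti_tunnel_lookup() above, and the tunnels[] setup in vti_init_net()). The stand-alone sketch below reproduces just that bucket selection so the scheme is easy to see in isolation; it is illustrative only, and the sample addresses are arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define HASH_SIZE 16
#define HASH(addr) ((((uint32_t)(addr)) ^ (((uint32_t)(addr)) >> 4)) & (HASH_SIZE - 1))

/* Mirror of __vti_bucket(): bit 1 of prio = remote set, bit 0 = local set,
 * so prio 0..3 maps to tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l. */
static void pick_bucket(uint32_t remote, uint32_t local)
{
    unsigned int h = 0;
    int prio = 0;

    if (remote) {
        prio |= 2;
        h ^= HASH(remote);
    }
    if (local) {
        prio |= 1;
        h ^= HASH(local);
    }
    printf("table %d, bucket %u\n", prio, h);
}

int main(void)
{
    pick_bucket(inet_addr("192.0.2.1"), inet_addr("198.51.100.2"));
    pick_bucket(0, inet_addr("198.51.100.2"));
    pick_bucket(0, 0);
    return 0;
}
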
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index d3ab47e19a8..c857f6f49b0 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -31,26 +31,17 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); 31 struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
32 struct xfrm_state *x; 32 struct xfrm_state *x;
33 33
34 switch (icmp_hdr(skb)->type) { 34 if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
35 case ICMP_DEST_UNREACH: 35 icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
36 if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
37 return;
38 case ICMP_REDIRECT:
39 break;
40 default:
41 return; 36 return;
42 }
43 37
44 spi = htonl(ntohs(ipch->cpi)); 38 spi = htonl(ntohs(ipch->cpi));
45 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, 39 x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
46 spi, IPPROTO_COMP, AF_INET); 40 spi, IPPROTO_COMP, AF_INET);
47 if (!x) 41 if (!x)
48 return; 42 return;
49 43 NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI4\n",
50 if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) 44 spi, &iph->daddr);
51 ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
52 else
53 ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
54 xfrm_state_put(x); 45 xfrm_state_put(x);
55} 46}
56 47
@@ -165,11 +156,11 @@ static const struct net_protocol ipcomp4_protocol = {
165static int __init ipcomp4_init(void) 156static int __init ipcomp4_init(void)
166{ 157{
167 if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { 158 if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
168 pr_info("%s: can't add xfrm type\n", __func__); 159 printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
169 return -EAGAIN; 160 return -EAGAIN;
170 } 161 }
171 if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { 162 if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
172 pr_info("%s: can't add protocol\n", __func__); 163 printk(KERN_INFO "ipcomp init: can't add protocol\n");
173 xfrm_unregister_type(&ipcomp_type, AF_INET); 164 xfrm_unregister_type(&ipcomp_type, AF_INET);
174 return -EAGAIN; 165 return -EAGAIN;
175 } 166 }
@@ -179,9 +170,9 @@ static int __init ipcomp4_init(void)
179static void __exit ipcomp4_fini(void) 170static void __exit ipcomp4_fini(void)
180{ 171{
181 if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) 172 if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
182 pr_info("%s: can't remove protocol\n", __func__); 173 printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
183 if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) 174 if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
184 pr_info("%s: can't remove xfrm type\n", __func__); 175 printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
185} 176}
186 177
187module_init(ipcomp4_init); 178module_init(ipcomp4_init);
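
Both the old and new versions of ipcomp4_err() above derive the xfrm lookup key the same way: the 16-bit IPComp CPI taken from the packet is widened into a 32-bit SPI while staying in network byte order. A tiny stand-alone sketch of that construction (the CPI value is arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
    uint16_t cpi = htons(0x1234);       /* CPI as carried in the IPComp header */
    uint32_t spi = htonl(ntohs(cpi));   /* same expression as in ipcomp4_err() */

    printf("cpi=0x%04x spi=0x%08x\n", ntohs(cpi), ntohl(spi));
    return 0;
}
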
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index a2e50ae80b5..004bb74b41c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -54,7 +54,6 @@
54#include <linux/delay.h> 54#include <linux/delay.h>
55#include <linux/nfs_fs.h> 55#include <linux/nfs_fs.h>
56#include <linux/slab.h> 56#include <linux/slab.h>
57#include <linux/export.h>
58#include <net/net_namespace.h> 57#include <net/net_namespace.h>
59#include <net/arp.h> 58#include <net/arp.h>
60#include <net/ip.h> 59#include <net/ip.h>
@@ -136,14 +135,12 @@ __be32 ic_myaddr = NONE; /* My IP address */
136static __be32 ic_netmask = NONE; /* Netmask for local subnet */ 135static __be32 ic_netmask = NONE; /* Netmask for local subnet */
137__be32 ic_gateway = NONE; /* Gateway IP address */ 136__be32 ic_gateway = NONE; /* Gateway IP address */
138 137
139__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
140
141__be32 ic_servaddr = NONE; /* Boot server IP address */ 138__be32 ic_servaddr = NONE; /* Boot server IP address */
142 139
143__be32 root_server_addr = NONE; /* Address of NFS server */ 140__be32 root_server_addr = NONE; /* Address of NFS server */
144u8 root_server_path[256] = { 0, }; /* Path to mount as root */ 141u8 root_server_path[256] = { 0, }; /* Path to mount as root */
145 142
146__be32 ic_dev_xid; /* Device under configuration */ 143u32 ic_dev_xid; /* Device under configuration */
147 144
148/* vendor class identifier */ 145/* vendor class identifier */
149static char vendor_class_identifier[253] __initdata; 146static char vendor_class_identifier[253] __initdata;
@@ -216,7 +213,7 @@ static int __init ic_open_devs(void)
216 if (!(dev->flags & IFF_LOOPBACK)) 213 if (!(dev->flags & IFF_LOOPBACK))
217 continue; 214 continue;
218 if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) 215 if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
219 pr_err("IP-Config: Failed to open %s\n", dev->name); 216 printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
220 } 217 }
221 218
222 for_each_netdev(&init_net, dev) { 219 for_each_netdev(&init_net, dev) {
@@ -225,8 +222,7 @@ static int __init ic_open_devs(void)
225 if (dev->mtu >= 364) 222 if (dev->mtu >= 364)
226 able |= IC_BOOTP; 223 able |= IC_BOOTP;
227 else 224 else
228 pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small", 225 printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu);
229 dev->name, dev->mtu);
230 if (!(dev->flags & IFF_NOARP)) 226 if (!(dev->flags & IFF_NOARP))
231 able |= IC_RARP; 227 able |= IC_RARP;
232 able &= ic_proto_enabled; 228 able &= ic_proto_enabled;
@@ -234,8 +230,7 @@ static int __init ic_open_devs(void)
234 continue; 230 continue;
235 oflags = dev->flags; 231 oflags = dev->flags;
236 if (dev_change_flags(dev, oflags | IFF_UP) < 0) { 232 if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
237 pr_err("IP-Config: Failed to open %s\n", 233 printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
238 dev->name);
239 continue; 234 continue;
240 } 235 }
241 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { 236 if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
@@ -277,10 +272,9 @@ have_carrier:
277 272
278 if (!ic_first_dev) { 273 if (!ic_first_dev) {
279 if (user_dev_name[0]) 274 if (user_dev_name[0])
280 pr_err("IP-Config: Device `%s' not found\n", 275 printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
281 user_dev_name);
282 else 276 else
283 pr_err("IP-Config: No network devices available\n"); 277 printk(KERN_ERR "IP-Config: No network devices available.\n");
284 return -ENODEV; 278 return -ENODEV;
285 } 279 }
286 return 0; 280 return 0;
@@ -364,20 +358,17 @@ static int __init ic_setup_if(void)
364 strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); 358 strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
365 set_sockaddr(sin, ic_myaddr, 0); 359 set_sockaddr(sin, ic_myaddr, 0);
366 if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) { 360 if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
367 pr_err("IP-Config: Unable to set interface address (%d)\n", 361 printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
368 err);
369 return -1; 362 return -1;
370 } 363 }
371 set_sockaddr(sin, ic_netmask, 0); 364 set_sockaddr(sin, ic_netmask, 0);
372 if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) { 365 if ((err = ic_devinet_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
373 pr_err("IP-Config: Unable to set interface netmask (%d)\n", 366 printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
374 err);
375 return -1; 367 return -1;
376 } 368 }
377 set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); 369 set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
378 if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { 370 if ((err = ic_devinet_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
379 pr_err("IP-Config: Unable to set interface broadcast address (%d)\n", 371 printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
380 err);
381 return -1; 372 return -1;
382 } 373 }
383 /* Handle the case where we need non-standard MTU on the boot link (a network 374 /* Handle the case where we need non-standard MTU on the boot link (a network
@@ -388,8 +379,8 @@ static int __init ic_setup_if(void)
388 strcpy(ir.ifr_name, ic_dev->name); 379 strcpy(ir.ifr_name, ic_dev->name);
389 ir.ifr_mtu = ic_dev_mtu; 380 ir.ifr_mtu = ic_dev_mtu;
390 if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0) 381 if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
391 pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n", 382 printk(KERN_ERR "IP-Config: Unable to set interface mtu to %d (%d).\n",
392 ic_dev_mtu, err); 383 ic_dev_mtu, err);
393 } 384 }
394 return 0; 385 return 0;
395} 386}
@@ -404,7 +395,7 @@ static int __init ic_setup_routes(void)
404 395
405 memset(&rm, 0, sizeof(rm)); 396 memset(&rm, 0, sizeof(rm));
406 if ((ic_gateway ^ ic_myaddr) & ic_netmask) { 397 if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
407 pr_err("IP-Config: Gateway not on directly connected network\n"); 398 printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
408 return -1; 399 return -1;
409 } 400 }
410 set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); 401 set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
@@ -412,8 +403,7 @@ static int __init ic_setup_routes(void)
412 set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); 403 set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
413 rm.rt_flags = RTF_UP | RTF_GATEWAY; 404 rm.rt_flags = RTF_UP | RTF_GATEWAY;
414 if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { 405 if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
415 pr_err("IP-Config: Cannot add default route (%d)\n", 406 printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
416 err);
417 return -1; 407 return -1;
418 } 408 }
419 } 409 }
@@ -446,8 +436,8 @@ static int __init ic_defaults(void)
446 else if (IN_CLASSC(ntohl(ic_myaddr))) 436 else if (IN_CLASSC(ntohl(ic_myaddr)))
447 ic_netmask = htonl(IN_CLASSC_NET); 437 ic_netmask = htonl(IN_CLASSC_NET);
448 else { 438 else {
449 pr_err("IP-Config: Unable to guess netmask for address %pI4\n", 439 printk(KERN_ERR "IP-Config: Unable to guess netmask for address %pI4\n",
450 &ic_myaddr); 440 &ic_myaddr);
451 return -1; 441 return -1;
452 } 442 }
453 printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask); 443 printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
@@ -560,7 +550,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
560 if (ic_myaddr == NONE) 550 if (ic_myaddr == NONE)
561 ic_myaddr = tip; 551 ic_myaddr = tip;
562 ic_servaddr = sip; 552 ic_servaddr = sip;
563 ic_addrservaddr = sip;
564 ic_got_reply = IC_RARP; 553 ic_got_reply = IC_RARP;
565 554
566drop_unlock: 555drop_unlock:
@@ -586,17 +575,6 @@ static void __init ic_rarp_send_if(struct ic_device *d)
586#endif 575#endif
587 576
588/* 577/*
589 * Predefine Nameservers
590 */
591static inline void __init ic_nameservers_predef(void)
592{
593 int i;
594
595 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
596 ic_nameservers[i] = NONE;
597}
598
599/*
600 * DHCP/BOOTP support. 578 * DHCP/BOOTP support.
601 */ 579 */
602 580
@@ -709,8 +687,8 @@ ic_dhcp_init_options(u8 *options)
709 e += len; 687 e += len;
710 } 688 }
711 if (*vendor_class_identifier) { 689 if (*vendor_class_identifier) {
712 pr_info("DHCP: sending class identifier \"%s\"\n", 690 printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
713 vendor_class_identifier); 691 vendor_class_identifier);
714 *e++ = 60; /* Class-identifier */ 692 *e++ = 60; /* Class-identifier */
715 len = strlen(vendor_class_identifier); 693 len = strlen(vendor_class_identifier);
716 *e++ = len; 694 *e++ = len;
@@ -761,7 +739,10 @@ static void __init ic_bootp_init_ext(u8 *e)
761 */ 739 */
762static inline void __init ic_bootp_init(void) 740static inline void __init ic_bootp_init(void)
763{ 741{
764 ic_nameservers_predef(); 742 int i;
743
744 for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
745 ic_nameservers[i] = NONE;
765 746
766 dev_add_pack(&bootp_packet_type); 747 dev_add_pack(&bootp_packet_type);
767} 748}
@@ -785,15 +766,13 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
785 struct sk_buff *skb; 766 struct sk_buff *skb;
786 struct bootp_pkt *b; 767 struct bootp_pkt *b;
787 struct iphdr *h; 768 struct iphdr *h;
788 int hlen = LL_RESERVED_SPACE(dev);
789 int tlen = dev->needed_tailroom;
790 769
791 /* Allocate packet */ 770 /* Allocate packet */
792 skb = alloc_skb(sizeof(struct bootp_pkt) + hlen + tlen + 15, 771 skb = alloc_skb(sizeof(struct bootp_pkt) + LL_ALLOCATED_SPACE(dev) + 15,
793 GFP_KERNEL); 772 GFP_KERNEL);
794 if (!skb) 773 if (!skb)
795 return; 774 return;
796 skb_reserve(skb, hlen); 775 skb_reserve(skb, LL_RESERVED_SPACE(dev));
797 b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); 776 b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
798 memset(b, 0, sizeof(struct bootp_pkt)); 777 memset(b, 0, sizeof(struct bootp_pkt));
799 778
@@ -819,6 +798,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
819 b->op = BOOTP_REQUEST; 798 b->op = BOOTP_REQUEST;
820 if (dev->type < 256) /* check for false types */ 799 if (dev->type < 256) /* check for false types */
821 b->htype = dev->type; 800 b->htype = dev->type;
801 else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
802 b->htype = ARPHRD_IEEE802;
822 else if (dev->type == ARPHRD_FDDI) 803 else if (dev->type == ARPHRD_FDDI)
823 b->htype = ARPHRD_ETHER; 804 b->htype = ARPHRD_ETHER;
824 else { 805 else {
@@ -844,13 +825,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
844 skb->dev = dev; 825 skb->dev = dev;
845 skb->protocol = htons(ETH_P_IP); 826 skb->protocol = htons(ETH_P_IP);
846 if (dev_hard_header(skb, dev, ntohs(skb->protocol), 827 if (dev_hard_header(skb, dev, ntohs(skb->protocol),
847 dev->broadcast, dev->dev_addr, skb->len) < 0) { 828 dev->broadcast, dev->dev_addr, skb->len) < 0 ||
848 kfree_skb(skb); 829 dev_queue_xmit(skb) < 0)
849 printk("E");
850 return;
851 }
852
853 if (dev_queue_xmit(skb) < 0)
854 printk("E"); 830 printk("E");
855} 831}
856 832
@@ -875,9 +851,9 @@ static int __init ic_bootp_string(char *dest, char *src, int len, int max)
875 */ 851 */
876static void __init ic_do_bootp_ext(u8 *ext) 852static void __init ic_do_bootp_ext(u8 *ext)
877{ 853{
878 u8 servers; 854 u8 servers;
879 int i; 855 int i;
880 __be16 mtu; 856 u16 mtu;
881 857
882#ifdef IPCONFIG_DEBUG 858#ifdef IPCONFIG_DEBUG
883 u8 *c; 859 u8 *c;
@@ -964,7 +940,9 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
964 940
965 /* Fragments are not supported */ 941 /* Fragments are not supported */
966 if (ip_is_fragment(h)) { 942 if (ip_is_fragment(h)) {
967 net_err_ratelimited("DHCP/BOOTP: Ignoring fragmented reply\n"); 943 if (net_ratelimit())
944 printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
945 "reply.\n");
968 goto drop; 946 goto drop;
969 } 947 }
970 948
@@ -1012,14 +990,17 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
1012 /* Is it a reply to our BOOTP request? */ 990 /* Is it a reply to our BOOTP request? */
1013 if (b->op != BOOTP_REPLY || 991 if (b->op != BOOTP_REPLY ||
1014 b->xid != d->xid) { 992 b->xid != d->xid) {
1015 net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n", 993 if (net_ratelimit())
1016 b->op, b->xid); 994 printk(KERN_ERR "DHCP/BOOTP: Reply not for us, "
995 "op[%x] xid[%x]\n",
996 b->op, b->xid);
1017 goto drop_unlock; 997 goto drop_unlock;
1018 } 998 }
1019 999
1020 /* Is it a reply for the device we are configuring? */ 1000 /* Is it a reply for the device we are configuring? */
1021 if (b->xid != ic_dev_xid) { 1001 if (b->xid != ic_dev_xid) {
1022 net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n"); 1002 if (net_ratelimit())
1003 printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n");
1023 goto drop_unlock; 1004 goto drop_unlock;
1024 } 1005 }
1025 1006
@@ -1071,7 +1052,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
1071 ic_servaddr = server_id; 1052 ic_servaddr = server_id;
1072#ifdef IPCONFIG_DEBUG 1053#ifdef IPCONFIG_DEBUG
1073 printk("DHCP: Offered address %pI4 by server %pI4\n", 1054 printk("DHCP: Offered address %pI4 by server %pI4\n",
1074 &ic_myaddr, &b->iph.saddr); 1055 &ic_myaddr, &ic_servaddr);
1075#endif 1056#endif
1076 /* The DHCP indicated server address takes 1057 /* The DHCP indicated server address takes
1077 * precedence over the bootp header one if 1058 * precedence over the bootp header one if
@@ -1116,7 +1097,6 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
1116 ic_dev = dev; 1097 ic_dev = dev;
1117 ic_myaddr = b->your_ip; 1098 ic_myaddr = b->your_ip;
1118 ic_servaddr = b->server_ip; 1099 ic_servaddr = b->server_ip;
1119 ic_addrservaddr = b->iph.saddr;
1120 if (ic_gateway == NONE && b->relay_ip) 1100 if (ic_gateway == NONE && b->relay_ip)
1121 ic_gateway = b->relay_ip; 1101 ic_gateway = b->relay_ip;
1122 if (ic_nameservers[0] == NONE) 1102 if (ic_nameservers[0] == NONE)
@@ -1158,17 +1138,17 @@ static int __init ic_dynamic(void)
1158 * are missing, and without DHCP/BOOTP/RARP we are unable to get it. 1138 * are missing, and without DHCP/BOOTP/RARP we are unable to get it.
1159 */ 1139 */
1160 if (!ic_proto_enabled) { 1140 if (!ic_proto_enabled) {
1161 pr_err("IP-Config: Incomplete network configuration information\n"); 1141 printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
1162 return -1; 1142 return -1;
1163 } 1143 }
1164 1144
1165#ifdef IPCONFIG_BOOTP 1145#ifdef IPCONFIG_BOOTP
1166 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) 1146 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
1167 pr_err("DHCP/BOOTP: No suitable device found\n"); 1147 printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n");
1168#endif 1148#endif
1169#ifdef IPCONFIG_RARP 1149#ifdef IPCONFIG_RARP
1170 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) 1150 if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
1171 pr_err("RARP: No suitable device found\n"); 1151 printk(KERN_ERR "RARP: No suitable device found.\n");
1172#endif 1152#endif
1173 1153
1174 if (!ic_proto_have_if) 1154 if (!ic_proto_have_if)
@@ -1195,17 +1175,17 @@ static int __init ic_dynamic(void)
1195 * [Actually we could now, but the nothing else running note still 1175 * [Actually we could now, but the nothing else running note still
1196 * applies.. - AC] 1176 * applies.. - AC]
1197 */ 1177 */
1198 pr_notice("Sending %s%s%s requests .", 1178 printk(KERN_NOTICE "Sending %s%s%s requests .",
1199 do_bootp 1179 do_bootp
1200 ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "", 1180 ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
1201 (do_bootp && do_rarp) ? " and " : "", 1181 (do_bootp && do_rarp) ? " and " : "",
1202 do_rarp ? "RARP" : ""); 1182 do_rarp ? "RARP" : "");
1203 1183
1204 start_jiffies = jiffies; 1184 start_jiffies = jiffies;
1205 d = ic_first_dev; 1185 d = ic_first_dev;
1206 retries = CONF_SEND_RETRIES; 1186 retries = CONF_SEND_RETRIES;
1207 get_random_bytes(&timeout, sizeof(timeout)); 1187 get_random_bytes(&timeout, sizeof(timeout));
1208 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM); 1188 timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
1209 for (;;) { 1189 for (;;) {
1210 /* Track the device we are configuring */ 1190 /* Track the device we are configuring */
1211 ic_dev_xid = d->xid; 1191 ic_dev_xid = d->xid;
@@ -1228,13 +1208,13 @@ static int __init ic_dynamic(void)
1228 (ic_proto_enabled & IC_USE_DHCP) && 1208 (ic_proto_enabled & IC_USE_DHCP) &&
1229 ic_dhcp_msgtype != DHCPACK) { 1209 ic_dhcp_msgtype != DHCPACK) {
1230 ic_got_reply = 0; 1210 ic_got_reply = 0;
1231 pr_cont(","); 1211 printk(KERN_CONT ",");
1232 continue; 1212 continue;
1233 } 1213 }
1234#endif /* IPCONFIG_DHCP */ 1214#endif /* IPCONFIG_DHCP */
1235 1215
1236 if (ic_got_reply) { 1216 if (ic_got_reply) {
1237 pr_cont(" OK\n"); 1217 printk(KERN_CONT " OK\n");
1238 break; 1218 break;
1239 } 1219 }
1240 1220
@@ -1242,7 +1222,7 @@ static int __init ic_dynamic(void)
1242 continue; 1222 continue;
1243 1223
1244 if (! --retries) { 1224 if (! --retries) {
1245 pr_cont(" timed out!\n"); 1225 printk(KERN_CONT " timed out!\n");
1246 break; 1226 break;
1247 } 1227 }
1248 1228
@@ -1252,7 +1232,7 @@ static int __init ic_dynamic(void)
1252 if (timeout > CONF_TIMEOUT_MAX) 1232 if (timeout > CONF_TIMEOUT_MAX)
1253 timeout = CONF_TIMEOUT_MAX; 1233 timeout = CONF_TIMEOUT_MAX;
1254 1234
1255 pr_cont("."); 1235 printk(KERN_CONT ".");
1256 } 1236 }
1257 1237
1258#ifdef IPCONFIG_BOOTP 1238#ifdef IPCONFIG_BOOTP
@@ -1272,8 +1252,8 @@ static int __init ic_dynamic(void)
1272 printk("IP-Config: Got %s answer from %pI4, ", 1252 printk("IP-Config: Got %s answer from %pI4, ",
1273 ((ic_got_reply & IC_RARP) ? "RARP" 1253 ((ic_got_reply & IC_RARP) ? "RARP"
1274 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), 1254 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
1275 &ic_addrservaddr); 1255 &ic_servaddr);
1276 pr_cont("my address is %pI4\n", &ic_myaddr); 1256 printk(KERN_CONT "my address is %pI4\n", &ic_myaddr);
1277 1257
1278 return 0; 1258 return 0;
1279} 1259}
@@ -1391,7 +1371,6 @@ static int __init ip_auto_config(void)
1391 int retries = CONF_OPEN_RETRIES; 1371 int retries = CONF_OPEN_RETRIES;
1392#endif 1372#endif
1393 int err; 1373 int err;
1394 unsigned int i;
1395 1374
1396#ifdef CONFIG_PROC_FS 1375#ifdef CONFIG_PROC_FS
1397 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); 1376 proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1450,22 +1429,24 @@ static int __init ip_auto_config(void)
1450 */ 1429 */
1451#ifdef CONFIG_ROOT_NFS 1430#ifdef CONFIG_ROOT_NFS
1452 if (ROOT_DEV == Root_NFS) { 1431 if (ROOT_DEV == Root_NFS) {
1453 pr_err("IP-Config: Retrying forever (NFS root)...\n"); 1432 printk(KERN_ERR
1433 "IP-Config: Retrying forever (NFS root)...\n");
1454 goto try_try_again; 1434 goto try_try_again;
1455 } 1435 }
1456#endif 1436#endif
1457 1437
1458 if (--retries) { 1438 if (--retries) {
1459 pr_err("IP-Config: Reopening network devices...\n"); 1439 printk(KERN_ERR
1440 "IP-Config: Reopening network devices...\n");
1460 goto try_try_again; 1441 goto try_try_again;
1461 } 1442 }
1462 1443
1463 /* Oh, well. At least we tried. */ 1444 /* Oh, well. At least we tried. */
1464 pr_err("IP-Config: Auto-configuration of network failed\n"); 1445 printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
1465 return -1; 1446 return -1;
1466 } 1447 }
1467#else /* !DYNAMIC */ 1448#else /* !DYNAMIC */
1468 pr_err("IP-Config: Incomplete network configuration information\n"); 1449 printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
1469 ic_close_devs(); 1450 ic_close_devs();
1470 return -1; 1451 return -1;
1471#endif /* IPCONFIG_DYNAMIC */ 1452#endif /* IPCONFIG_DYNAMIC */
@@ -1503,26 +1484,19 @@ static int __init ip_auto_config(void)
1503 /* 1484 /*
1504 * Clue in the operator. 1485 * Clue in the operator.
1505 */ 1486 */
1506 pr_info("IP-Config: Complete:\n"); 1487 printk("IP-Config: Complete:\n");
1507 1488 printk(" device=%s", ic_dev->name);
1508 pr_info(" device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n", 1489 printk(KERN_CONT ", addr=%pI4", &ic_myaddr);
1509 ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr, 1490 printk(KERN_CONT ", mask=%pI4", &ic_netmask);
1510 &ic_myaddr, &ic_netmask, &ic_gateway); 1491 printk(KERN_CONT ", gw=%pI4", &ic_gateway);
1511 pr_info(" host=%s, domain=%s, nis-domain=%s\n", 1492 printk(KERN_CONT ",\n host=%s, domain=%s, nis-domain=%s",
1512 utsname()->nodename, ic_domain, utsname()->domainname); 1493 utsname()->nodename, ic_domain, utsname()->domainname);
1513 pr_info(" bootserver=%pI4, rootserver=%pI4, rootpath=%s", 1494 printk(KERN_CONT ",\n bootserver=%pI4", &ic_servaddr);
1514 &ic_servaddr, &root_server_addr, root_server_path); 1495 printk(KERN_CONT ", rootserver=%pI4", &root_server_addr);
1496 printk(KERN_CONT ", rootpath=%s", root_server_path);
1515 if (ic_dev_mtu) 1497 if (ic_dev_mtu)
1516 pr_cont(", mtu=%d", ic_dev_mtu); 1498 printk(KERN_CONT ", mtu=%d", ic_dev_mtu);
1517 for (i = 0; i < CONF_NAMESERVERS_MAX; i++) 1499 printk(KERN_CONT "\n");
1518 if (ic_nameservers[i] != NONE) {
1519 pr_info(" nameserver%u=%pI4",
1520 i, &ic_nameservers[i]);
1521 break;
1522 }
1523 for (i++; i < CONF_NAMESERVERS_MAX; i++)
1524 if (ic_nameservers[i] != NONE)
1525 pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]);
1526#endif /* !SILENT */ 1500#endif /* !SILENT */
1527 1501
1528 return 0; 1502 return 0;
@@ -1593,8 +1567,6 @@ static int __init ip_auto_config_setup(char *addrs)
1593 return 1; 1567 return 1;
1594 } 1568 }
1595 1569
1596 ic_nameservers_predef();
1597
1598 /* Parse string for static IP assignment. */ 1570 /* Parse string for static IP assignment. */
1599 ip = addrs; 1571 ip = addrs;
1600 while (ip && *ip) { 1572 while (ip && *ip) {
@@ -1638,20 +1610,6 @@ static int __init ip_auto_config_setup(char *addrs)
1638 ic_enable = 0; 1610 ic_enable = 0;
1639 } 1611 }
1640 break; 1612 break;
1641 case 7:
1642 if (CONF_NAMESERVERS_MAX >= 1) {
1643 ic_nameservers[0] = in_aton(ip);
1644 if (ic_nameservers[0] == ANY)
1645 ic_nameservers[0] = NONE;
1646 }
1647 break;
1648 case 8:
1649 if (CONF_NAMESERVERS_MAX >= 2) {
1650 ic_nameservers[1] = in_aton(ip);
1651 if (ic_nameservers[1] == ANY)
1652 ic_nameservers[1] = NONE;
1653 }
1654 break;
1655 } 1613 }
1656 } 1614 }
1657 ip = cp; 1615 ip = cp;
@@ -1660,21 +1618,22 @@ static int __init ip_auto_config_setup(char *addrs)
1660 1618
1661 return 1; 1619 return 1;
1662} 1620}
1663__setup("ip=", ip_auto_config_setup);
1664 1621
1665static int __init nfsaddrs_config_setup(char *addrs) 1622static int __init nfsaddrs_config_setup(char *addrs)
1666{ 1623{
1667 return ip_auto_config_setup(addrs); 1624 return ip_auto_config_setup(addrs);
1668} 1625}
1669__setup("nfsaddrs=", nfsaddrs_config_setup);
1670 1626
1671static int __init vendor_class_identifier_setup(char *addrs) 1627static int __init vendor_class_identifier_setup(char *addrs)
1672{ 1628{
1673 if (strlcpy(vendor_class_identifier, addrs, 1629 if (strlcpy(vendor_class_identifier, addrs,
1674 sizeof(vendor_class_identifier)) 1630 sizeof(vendor_class_identifier))
1675 >= sizeof(vendor_class_identifier)) 1631 >= sizeof(vendor_class_identifier))
1676 pr_warn("DHCP: vendorclass too long, truncated to \"%s\"", 1632 printk(KERN_WARNING "DHCP: vendorclass too long, truncated to \"%s\"",
1677 vendor_class_identifier); 1633 vendor_class_identifier);
1678 return 1; 1634 return 1;
1679} 1635}
1636
1637__setup("ip=", ip_auto_config_setup);
1638__setup("nfsaddrs=", nfsaddrs_config_setup);
1680__setup("dhcpclass=", vendor_class_identifier_setup); 1639__setup("dhcpclass=", vendor_class_identifier_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 191fc24a745..6f06f7f39ea 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -120,10 +120,6 @@
120#define HASH_SIZE 16 120#define HASH_SIZE 16
121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) 121#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
122 122
123static bool log_ecn_error = true;
124module_param(log_ecn_error, bool, 0644);
125MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
127static int ipip_net_id __read_mostly; 123static int ipip_net_id __read_mostly;
128struct ipip_net { 124struct ipip_net {
129 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; 125 struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
@@ -138,43 +134,43 @@ struct ipip_net {
138static int ipip_tunnel_init(struct net_device *dev); 134static int ipip_tunnel_init(struct net_device *dev);
139static void ipip_tunnel_setup(struct net_device *dev); 135static void ipip_tunnel_setup(struct net_device *dev);
140static void ipip_dev_free(struct net_device *dev); 136static void ipip_dev_free(struct net_device *dev);
141static struct rtnl_link_ops ipip_link_ops __read_mostly;
142 137
143static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev, 138/*
144 struct rtnl_link_stats64 *tot) 139 * Locking : hash tables are protected by RCU and RTNL
140 */
141
142#define for_each_ip_tunnel_rcu(start) \
143 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
144
145/* often modified stats are per cpu, other are shared (netdev->stats) */
146struct pcpu_tstats {
147 unsigned long rx_packets;
148 unsigned long rx_bytes;
149 unsigned long tx_packets;
150 unsigned long tx_bytes;
151};
152
153static struct net_device_stats *ipip_get_stats(struct net_device *dev)
145{ 154{
155 struct pcpu_tstats sum = { 0 };
146 int i; 156 int i;
147 157
148 for_each_possible_cpu(i) { 158 for_each_possible_cpu(i) {
149 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); 159 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
150 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
151 unsigned int start;
152
153 do {
154 start = u64_stats_fetch_begin_bh(&tstats->syncp);
155 rx_packets = tstats->rx_packets;
156 tx_packets = tstats->tx_packets;
157 rx_bytes = tstats->rx_bytes;
158 tx_bytes = tstats->tx_bytes;
159 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
160
161 tot->rx_packets += rx_packets;
162 tot->tx_packets += tx_packets;
163 tot->rx_bytes += rx_bytes;
164 tot->tx_bytes += tx_bytes;
165 }
166
167 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
168 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
169 tot->tx_dropped = dev->stats.tx_dropped;
170 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
171 tot->tx_errors = dev->stats.tx_errors;
172 tot->collisions = dev->stats.collisions;
173 160
174 return tot; 161 sum.rx_packets += tstats->rx_packets;
162 sum.rx_bytes += tstats->rx_bytes;
163 sum.tx_packets += tstats->tx_packets;
164 sum.tx_bytes += tstats->tx_bytes;
165 }
166 dev->stats.rx_packets = sum.rx_packets;
167 dev->stats.rx_bytes = sum.rx_bytes;
168 dev->stats.tx_packets = sum.tx_packets;
169 dev->stats.tx_bytes = sum.tx_bytes;
170 return &dev->stats;
175} 171}
176 172
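The right-hand ipip_get_stats() above replaces the u64_stats_fetch/ndo_get_stats64 path with a plain fold of the per-cpu pcpu_tstats counters into dev->stats. Below is a stand-alone sketch of that "hot counters per CPU, sum on read" idea, with a fixed array standing in for per_cpu_ptr(); the names and values are illustrative only.

/* Sketch of the per-CPU counter fold used by ipip_get_stats(). */
#include <stdio.h>

#define NR_CPUS 4

struct tstats {
	unsigned long rx_packets, rx_bytes, tx_packets, tx_bytes;
};

static struct tstats percpu[NR_CPUS];	/* each slot written only by "its" CPU */

static struct tstats fold_stats(void)
{
	struct tstats sum = { 0 };
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		sum.rx_packets += percpu[i].rx_packets;
		sum.rx_bytes   += percpu[i].rx_bytes;
		sum.tx_packets += percpu[i].tx_packets;
		sum.tx_bytes   += percpu[i].tx_bytes;
	}
	return sum;		/* what dev->stats gets refreshed from */
}

int main(void)
{
	percpu[0].rx_packets = 3; percpu[0].rx_bytes = 1800;
	percpu[2].rx_packets = 1; percpu[2].rx_bytes = 60;

	struct tstats t = fold_stats();
	printf("rx_packets=%lu rx_bytes=%lu\n", t.rx_packets, t.rx_bytes);
	return 0;
}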
177static struct ip_tunnel *ipip_tunnel_lookup(struct net *net, 173static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
178 __be32 remote, __be32 local) 174 __be32 remote, __be32 local)
179{ 175{
180 unsigned int h0 = HASH(remote); 176 unsigned int h0 = HASH(remote);
@@ -182,16 +178,16 @@ static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
182 struct ip_tunnel *t; 178 struct ip_tunnel *t;
183 struct ipip_net *ipn = net_generic(net, ipip_net_id); 179 struct ipip_net *ipn = net_generic(net, ipip_net_id);
184 180
185 for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1]) 181 for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
186 if (local == t->parms.iph.saddr && 182 if (local == t->parms.iph.saddr &&
187 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) 183 remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
188 return t; 184 return t;
189 185
190 for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0]) 186 for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
191 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) 187 if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
192 return t; 188 return t;
193 189
194 for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1]) 190 for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
195 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) 191 if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
196 return t; 192 return t;
197 193
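ipip_tunnel_lookup() above walks the hash buckets in priority order: an exact (remote, local) match first, then remote-only, then local-only (the fully wildcard fallback bucket lies outside this hunk). A minimal sketch of that precedence with a flat table and no RCU follows; the addresses and tunnel names are made up for the illustration.

/* Sketch of the lookup precedence in ipip_tunnel_lookup(). */
#include <stdint.h>
#include <stdio.h>

struct tun {
	uint32_t saddr;		/* local  (0 = wildcard) */
	uint32_t daddr;		/* remote (0 = wildcard) */
	const char *name;
};

static struct tun tuns[] = {
	{ 0,          0xc0a80002, "tunl-remote-only" },
	{ 0x0a000001, 0xc0a80002, "tunl-exact" },
	{ 0x0a000001, 0,          "tunl-local-only" },
};

static const char *lookup(uint32_t remote, uint32_t local)
{
	unsigned int i, n = sizeof(tuns) / sizeof(tuns[0]);

	for (i = 0; i < n; i++)		/* pass 1: exact (remote, local) */
		if (tuns[i].saddr == local && tuns[i].daddr == remote)
			return tuns[i].name;
	for (i = 0; i < n; i++)		/* pass 2: remote only */
		if (tuns[i].daddr == remote && tuns[i].saddr == 0)
			return tuns[i].name;
	for (i = 0; i < n; i++)		/* pass 3: local only */
		if (tuns[i].saddr == local && tuns[i].daddr == 0)
			return tuns[i].name;
	return "(none)";
}

int main(void)
{
	printf("%s\n", lookup(0xc0a80002, 0x0a000001));	/* tunl-exact */
	printf("%s\n", lookup(0xc0a80002, 0x0a000009));	/* tunl-remote-only */
	return 0;
}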
@@ -249,33 +245,7 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
249 rcu_assign_pointer(*tp, t); 245 rcu_assign_pointer(*tp, t);
250} 246}
251 247
252static int ipip_tunnel_create(struct net_device *dev) 248static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
253{
254 struct ip_tunnel *t = netdev_priv(dev);
255 struct net *net = dev_net(dev);
256 struct ipip_net *ipn = net_generic(net, ipip_net_id);
257 int err;
258
259 err = ipip_tunnel_init(dev);
260 if (err < 0)
261 goto out;
262
263 err = register_netdevice(dev);
264 if (err < 0)
265 goto out;
266
267 strcpy(t->parms.name, dev->name);
268 dev->rtnl_link_ops = &ipip_link_ops;
269
270 dev_hold(dev);
271 ipip_tunnel_link(ipn, t);
272 return 0;
273
274out:
275 return err;
276}
277
278static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
279 struct ip_tunnel_parm *parms, int create) 249 struct ip_tunnel_parm *parms, int create)
280{ 250{
281 __be32 remote = parms->iph.daddr; 251 __be32 remote = parms->iph.daddr;
@@ -309,9 +279,16 @@ static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
309 nt = netdev_priv(dev); 279 nt = netdev_priv(dev);
310 nt->parms = *parms; 280 nt->parms = *parms;
311 281
312 if (ipip_tunnel_create(dev) < 0) 282 if (ipip_tunnel_init(dev) < 0)
283 goto failed_free;
284
285 if (register_netdevice(dev) < 0)
313 goto failed_free; 286 goto failed_free;
314 287
288 strcpy(nt->parms.name, dev->name);
289
290 dev_hold(dev);
291 ipip_tunnel_link(ipn, nt);
315 return nt; 292 return nt;
316 293
317failed_free: 294failed_free:
@@ -326,7 +303,7 @@ static void ipip_tunnel_uninit(struct net_device *dev)
326 struct ipip_net *ipn = net_generic(net, ipip_net_id); 303 struct ipip_net *ipn = net_generic(net, ipip_net_id);
327 304
328 if (dev == ipn->fb_tunnel_dev) 305 if (dev == ipn->fb_tunnel_dev)
329 RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL); 306 rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
330 else 307 else
331 ipip_tunnel_unlink(ipn, netdev_priv(dev)); 308 ipip_tunnel_unlink(ipn, netdev_priv(dev));
332 dev_put(dev); 309 dev_put(dev);
@@ -356,6 +333,9 @@ static int ipip_err(struct sk_buff *skb, u32 info)
356 case ICMP_PORT_UNREACH: 333 case ICMP_PORT_UNREACH:
357 /* Impossible event. */ 334 /* Impossible event. */
358 return 0; 335 return 0;
336 case ICMP_FRAG_NEEDED:
337 /* Soft state for pmtu is maintained by IP core. */
338 return 0;
359 default: 339 default:
360 /* All others are translated to HOST_UNREACH. 340 /* All others are translated to HOST_UNREACH.
361 rfc2003 contains "deep thoughts" about NET_UNREACH, 341 rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -368,30 +348,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
368 if (code != ICMP_EXC_TTL) 348 if (code != ICMP_EXC_TTL)
369 return 0; 349 return 0;
370 break; 350 break;
371 case ICMP_REDIRECT:
372 break;
373 } 351 }
374 352
375 err = -ENOENT; 353 err = -ENOENT;
376 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
377 if (t == NULL)
378 goto out;
379 354
380 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { 355 rcu_read_lock();
381 ipv4_update_pmtu(skb, dev_net(skb->dev), info, 356 t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
382 t->dev->ifindex, 0, IPPROTO_IPIP, 0); 357 if (t == NULL || t->parms.iph.daddr == 0)
383 err = 0;
384 goto out;
385 }
386
387 if (type == ICMP_REDIRECT) {
388 ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0,
389 IPPROTO_IPIP, 0);
390 err = 0;
391 goto out;
392 }
393
394 if (t->parms.iph.daddr == 0)
395 goto out; 358 goto out;
396 359
397 err = 0; 360 err = 0;
@@ -404,22 +367,34 @@ static int ipip_err(struct sk_buff *skb, u32 info)
404 t->err_count = 1; 367 t->err_count = 1;
405 t->err_time = jiffies; 368 t->err_time = jiffies;
406out: 369out:
407 370 rcu_read_unlock();
408 return err; 371 return err;
409} 372}
410 373
374static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
375 struct sk_buff *skb)
376{
377 struct iphdr *inner_iph = ip_hdr(skb);
378
379 if (INET_ECN_is_ce(outer_iph->tos))
380 IP_ECN_set_ce(inner_iph);
381}
382
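The restored ipip_ecn_decapsulate() above, like the IP_ECN_decapsulate() call it replaces, propagates a congestion mark from the outer header to the inner one when the tunnel is decapsulated. Below is a simplified sketch of that rule on plain TOS bytes; the kernel helpers also repair the IP checksum, which is omitted, and the drop of non-ECT inner packets is the stricter behaviour of the newer code, kept here for completeness.

/* Sketch of ECN handling on decapsulation (RFC 3168/6040 style). */
#include <stdio.h>

#define ECN_MASK    0x03	/* low two bits of the TOS byte */
#define ECN_NOT_ECT 0x00
#define ECN_CE      0x03

/* Returns 0 on success, -1 if the frame should be dropped
 * (outer says "congestion", inner cannot carry the mark). */
static int ecn_decapsulate(unsigned char outer_tos, unsigned char *inner_tos)
{
	if ((outer_tos & ECN_MASK) != ECN_CE)
		return 0;			/* nothing to propagate */
	if ((*inner_tos & ECN_MASK) == ECN_NOT_ECT)
		return -1;			/* stricter variant: drop */
	*inner_tos |= ECN_CE;			/* mark the inner packet CE */
	return 0;
}

int main(void)
{
	unsigned char inner = 0x02;		/* ECT(0) */

	if (ecn_decapsulate(0x03, &inner) == 0)
		printf("inner tos now 0x%02x (CE set)\n", inner);
	return 0;
}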
411static int ipip_rcv(struct sk_buff *skb) 383static int ipip_rcv(struct sk_buff *skb)
412{ 384{
413 struct ip_tunnel *tunnel; 385 struct ip_tunnel *tunnel;
414 const struct iphdr *iph = ip_hdr(skb); 386 const struct iphdr *iph = ip_hdr(skb);
415 int err;
416 387
388 rcu_read_lock();
417 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); 389 tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
418 if (tunnel != NULL) { 390 if (tunnel != NULL) {
419 struct pcpu_tstats *tstats; 391 struct pcpu_tstats *tstats;
420 392
421 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) 393 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
422 goto drop; 394 rcu_read_unlock();
395 kfree_skb(skb);
396 return 0;
397 }
423 398
424 secpath_reset(skb); 399 secpath_reset(skb);
425 400
@@ -428,35 +403,22 @@ static int ipip_rcv(struct sk_buff *skb)
428 skb->protocol = htons(ETH_P_IP); 403 skb->protocol = htons(ETH_P_IP);
429 skb->pkt_type = PACKET_HOST; 404 skb->pkt_type = PACKET_HOST;
430 405
431 __skb_tunnel_rx(skb, tunnel->dev);
432
433 err = IP_ECN_decapsulate(iph, skb);
434 if (unlikely(err)) {
435 if (log_ecn_error)
436 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
437 &iph->saddr, iph->tos);
438 if (err > 1) {
439 ++tunnel->dev->stats.rx_frame_errors;
440 ++tunnel->dev->stats.rx_errors;
441 goto drop;
442 }
443 }
444
445 tstats = this_cpu_ptr(tunnel->dev->tstats); 406 tstats = this_cpu_ptr(tunnel->dev->tstats);
446 u64_stats_update_begin(&tstats->syncp);
447 tstats->rx_packets++; 407 tstats->rx_packets++;
448 tstats->rx_bytes += skb->len; 408 tstats->rx_bytes += skb->len;
449 u64_stats_update_end(&tstats->syncp); 409
410 __skb_tunnel_rx(skb, tunnel->dev);
411
412 ipip_ecn_decapsulate(iph, skb);
450 413
451 netif_rx(skb); 414 netif_rx(skb);
415
416 rcu_read_unlock();
452 return 0; 417 return 0;
453 } 418 }
419 rcu_read_unlock();
454 420
455 return -1; 421 return -1;
456
457drop:
458 kfree_skb(skb);
459 return 0;
460} 422}
461 423
462/* 424/*
@@ -467,6 +429,7 @@ drop:
467static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) 429static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
468{ 430{
469 struct ip_tunnel *tunnel = netdev_priv(dev); 431 struct ip_tunnel *tunnel = netdev_priv(dev);
432 struct pcpu_tstats *tstats;
470 const struct iphdr *tiph = &tunnel->parms.iph; 433 const struct iphdr *tiph = &tunnel->parms.iph;
471 u8 tos = tunnel->parms.iph.tos; 434 u8 tos = tunnel->parms.iph.tos;
472 __be16 df = tiph->frag_off; 435 __be16 df = tiph->frag_off;
@@ -482,10 +445,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
482 if (skb->protocol != htons(ETH_P_IP)) 445 if (skb->protocol != htons(ETH_P_IP))
483 goto tx_error; 446 goto tx_error;
484 447
485 if (skb->ip_summed == CHECKSUM_PARTIAL &&
486 skb_checksum_help(skb))
487 goto tx_error;
488
489 if (tos & 1) 448 if (tos & 1)
490 tos = old_iph->tos; 449 tos = old_iph->tos;
491 450
@@ -495,7 +454,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
495 dev->stats.tx_fifo_errors++; 454 dev->stats.tx_fifo_errors++;
496 goto tx_error; 455 goto tx_error;
497 } 456 }
498 dst = rt_nexthop(rt, old_iph->daddr); 457 if ((dst = rt->rt_gateway) == 0)
458 goto tx_error_icmp;
499 } 459 }
500 460
501 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, 461 rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
@@ -527,7 +487,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
527 } 487 }
528 488
529 if (skb_dst(skb)) 489 if (skb_dst(skb))
530 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); 490 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
531 491
532 if ((old_iph->frag_off & htons(IP_DF)) && 492 if ((old_iph->frag_off & htons(IP_DF)) &&
533 mtu < ntohs(old_iph->tot_len)) { 493 mtu < ntohs(old_iph->tot_len)) {
@@ -593,7 +553,9 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
593 if ((iph->ttl = tiph->ttl) == 0) 553 if ((iph->ttl = tiph->ttl) == 0)
594 iph->ttl = old_iph->ttl; 554 iph->ttl = old_iph->ttl;
595 555
596 iptunnel_xmit(skb, dev); 556 nf_reset(skb);
557 tstats = this_cpu_ptr(dev->tstats);
558 __IPTUNNEL_XMIT(tstats, &dev->stats);
597 return NETDEV_TX_OK; 559 return NETDEV_TX_OK;
598 560
599tx_error_icmp: 561tx_error_icmp:
@@ -640,28 +602,6 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
640 dev->iflink = tunnel->parms.link; 602 dev->iflink = tunnel->parms.link;
641} 603}
642 604
643static void ipip_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
644{
645 struct net *net = dev_net(t->dev);
646 struct ipip_net *ipn = net_generic(net, ipip_net_id);
647
648 ipip_tunnel_unlink(ipn, t);
649 synchronize_net();
650 t->parms.iph.saddr = p->iph.saddr;
651 t->parms.iph.daddr = p->iph.daddr;
652 memcpy(t->dev->dev_addr, &p->iph.saddr, 4);
653 memcpy(t->dev->broadcast, &p->iph.daddr, 4);
654 ipip_tunnel_link(ipn, t);
655 t->parms.iph.ttl = p->iph.ttl;
656 t->parms.iph.tos = p->iph.tos;
657 t->parms.iph.frag_off = p->iph.frag_off;
658 if (t->parms.link != p->link) {
659 t->parms.link = p->link;
660 ipip_tunnel_bind_dev(t->dev);
661 }
662 netdev_state_change(t->dev);
663}
664
665static int 605static int
666ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) 606ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
667{ 607{
@@ -691,7 +631,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
691 case SIOCADDTUNNEL: 631 case SIOCADDTUNNEL:
692 case SIOCCHGTUNNEL: 632 case SIOCCHGTUNNEL:
693 err = -EPERM; 633 err = -EPERM;
694 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 634 if (!capable(CAP_NET_ADMIN))
695 goto done; 635 goto done;
696 636
697 err = -EFAULT; 637 err = -EFAULT;
@@ -720,13 +660,29 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
720 break; 660 break;
721 } 661 }
722 t = netdev_priv(dev); 662 t = netdev_priv(dev);
663 ipip_tunnel_unlink(ipn, t);
664 synchronize_net();
665 t->parms.iph.saddr = p.iph.saddr;
666 t->parms.iph.daddr = p.iph.daddr;
667 memcpy(dev->dev_addr, &p.iph.saddr, 4);
668 memcpy(dev->broadcast, &p.iph.daddr, 4);
669 ipip_tunnel_link(ipn, t);
670 netdev_state_change(dev);
723 } 671 }
724
725 ipip_tunnel_update(t, &p);
726 } 672 }
727 673
728 if (t) { 674 if (t) {
729 err = 0; 675 err = 0;
676 if (cmd == SIOCCHGTUNNEL) {
677 t->parms.iph.ttl = p.iph.ttl;
678 t->parms.iph.tos = p.iph.tos;
679 t->parms.iph.frag_off = p.iph.frag_off;
680 if (t->parms.link != p.link) {
681 t->parms.link = p.link;
682 ipip_tunnel_bind_dev(dev);
683 netdev_state_change(dev);
684 }
685 }
730 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) 686 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
731 err = -EFAULT; 687 err = -EFAULT;
732 } else 688 } else
@@ -735,7 +691,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
735 691
736 case SIOCDELTUNNEL: 692 case SIOCDELTUNNEL:
737 err = -EPERM; 693 err = -EPERM;
738 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 694 if (!capable(CAP_NET_ADMIN))
739 goto done; 695 goto done;
740 696
741 if (dev == ipn->fb_tunnel_dev) { 697 if (dev == ipn->fb_tunnel_dev) {
@@ -775,7 +731,7 @@ static const struct net_device_ops ipip_netdev_ops = {
775 .ndo_start_xmit = ipip_tunnel_xmit, 731 .ndo_start_xmit = ipip_tunnel_xmit,
776 .ndo_do_ioctl = ipip_tunnel_ioctl, 732 .ndo_do_ioctl = ipip_tunnel_ioctl,
777 .ndo_change_mtu = ipip_tunnel_change_mtu, 733 .ndo_change_mtu = ipip_tunnel_change_mtu,
778 .ndo_get_stats64 = ipip_get_stats64, 734 .ndo_get_stats = ipip_get_stats,
779}; 735};
780 736
781static void ipip_dev_free(struct net_device *dev) 737static void ipip_dev_free(struct net_device *dev)
@@ -784,11 +740,6 @@ static void ipip_dev_free(struct net_device *dev)
784 free_netdev(dev); 740 free_netdev(dev);
785} 741}
786 742
787#define IPIP_FEATURES (NETIF_F_SG | \
788 NETIF_F_FRAGLIST | \
789 NETIF_F_HIGHDMA | \
790 NETIF_F_HW_CSUM)
791
792static void ipip_tunnel_setup(struct net_device *dev) 743static void ipip_tunnel_setup(struct net_device *dev)
793{ 744{
794 dev->netdev_ops = &ipip_netdev_ops; 745 dev->netdev_ops = &ipip_netdev_ops;
@@ -803,9 +754,6 @@ static void ipip_tunnel_setup(struct net_device *dev)
803 dev->features |= NETIF_F_NETNS_LOCAL; 754 dev->features |= NETIF_F_NETNS_LOCAL;
804 dev->features |= NETIF_F_LLTX; 755 dev->features |= NETIF_F_LLTX;
805 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 756 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
806
807 dev->features |= IPIP_FEATURES;
808 dev->hw_features |= IPIP_FEATURES;
809} 757}
810 758
811static int ipip_tunnel_init(struct net_device *dev) 759static int ipip_tunnel_init(struct net_device *dev)
@@ -848,142 +796,6 @@ static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
848 return 0; 796 return 0;
849} 797}
850 798
851static void ipip_netlink_parms(struct nlattr *data[],
852 struct ip_tunnel_parm *parms)
853{
854 memset(parms, 0, sizeof(*parms));
855
856 parms->iph.version = 4;
857 parms->iph.protocol = IPPROTO_IPIP;
858 parms->iph.ihl = 5;
859
860 if (!data)
861 return;
862
863 if (data[IFLA_IPTUN_LINK])
864 parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
865
866 if (data[IFLA_IPTUN_LOCAL])
867 parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
868
869 if (data[IFLA_IPTUN_REMOTE])
870 parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
871
872 if (data[IFLA_IPTUN_TTL]) {
873 parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
874 if (parms->iph.ttl)
875 parms->iph.frag_off = htons(IP_DF);
876 }
877
878 if (data[IFLA_IPTUN_TOS])
879 parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
880
881 if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
882 parms->iph.frag_off = htons(IP_DF);
883}
884
885static int ipip_newlink(struct net *src_net, struct net_device *dev,
886 struct nlattr *tb[], struct nlattr *data[])
887{
888 struct net *net = dev_net(dev);
889 struct ip_tunnel *nt;
890
891 nt = netdev_priv(dev);
892 ipip_netlink_parms(data, &nt->parms);
893
894 if (ipip_tunnel_locate(net, &nt->parms, 0))
895 return -EEXIST;
896
897 return ipip_tunnel_create(dev);
898}
899
900static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
901 struct nlattr *data[])
902{
903 struct ip_tunnel *t;
904 struct ip_tunnel_parm p;
905 struct net *net = dev_net(dev);
906 struct ipip_net *ipn = net_generic(net, ipip_net_id);
907
908 if (dev == ipn->fb_tunnel_dev)
909 return -EINVAL;
910
911 ipip_netlink_parms(data, &p);
912
913 if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
914 (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
915 return -EINVAL;
916
917 t = ipip_tunnel_locate(net, &p, 0);
918
919 if (t) {
920 if (t->dev != dev)
921 return -EEXIST;
922 } else
923 t = netdev_priv(dev);
924
925 ipip_tunnel_update(t, &p);
926 return 0;
927}
928
929static size_t ipip_get_size(const struct net_device *dev)
930{
931 return
932 /* IFLA_IPTUN_LINK */
933 nla_total_size(4) +
934 /* IFLA_IPTUN_LOCAL */
935 nla_total_size(4) +
936 /* IFLA_IPTUN_REMOTE */
937 nla_total_size(4) +
938 /* IFLA_IPTUN_TTL */
939 nla_total_size(1) +
940 /* IFLA_IPTUN_TOS */
941 nla_total_size(1) +
942 /* IFLA_IPTUN_PMTUDISC */
943 nla_total_size(1) +
944 0;
945}
946
947static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
948{
949 struct ip_tunnel *tunnel = netdev_priv(dev);
950 struct ip_tunnel_parm *parm = &tunnel->parms;
951
952 if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
953 nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
954 nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
955 nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
956 nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
957 nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
958 !!(parm->iph.frag_off & htons(IP_DF))))
959 goto nla_put_failure;
960 return 0;
961
962nla_put_failure:
963 return -EMSGSIZE;
964}
965
966static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
967 [IFLA_IPTUN_LINK] = { .type = NLA_U32 },
968 [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 },
969 [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
970 [IFLA_IPTUN_TTL] = { .type = NLA_U8 },
971 [IFLA_IPTUN_TOS] = { .type = NLA_U8 },
972 [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
973};
974
975static struct rtnl_link_ops ipip_link_ops __read_mostly = {
976 .kind = "ipip",
977 .maxtype = IFLA_IPTUN_MAX,
978 .policy = ipip_policy,
979 .priv_size = sizeof(struct ip_tunnel),
980 .setup = ipip_tunnel_setup,
981 .newlink = ipip_newlink,
982 .changelink = ipip_changelink,
983 .get_size = ipip_get_size,
984 .fill_info = ipip_fill_info,
985};
986
987static struct xfrm_tunnel ipip_handler __read_mostly = { 799static struct xfrm_tunnel ipip_handler __read_mostly = {
988 .handler = ipip_rcv, 800 .handler = ipip_rcv,
989 .err_handler = ipip_err, 801 .err_handler = ipip_err,
@@ -1080,28 +892,16 @@ static int __init ipip_init(void)
1080 return err; 892 return err;
1081 err = xfrm4_tunnel_register(&ipip_handler, AF_INET); 893 err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
1082 if (err < 0) { 894 if (err < 0) {
1083 pr_info("%s: can't register tunnel\n", __func__); 895 unregister_pernet_device(&ipip_net_ops);
1084 goto xfrm_tunnel_failed; 896 printk(KERN_INFO "ipip init: can't register tunnel\n");
1085 } 897 }
1086 err = rtnl_link_register(&ipip_link_ops);
1087 if (err < 0)
1088 goto rtnl_link_failed;
1089
1090out:
1091 return err; 898 return err;
1092
1093rtnl_link_failed:
1094 xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
1095xfrm_tunnel_failed:
1096 unregister_pernet_device(&ipip_net_ops);
1097 goto out;
1098} 899}
1099 900
1100static void __exit ipip_fini(void) 901static void __exit ipip_fini(void)
1101{ 902{
1102 rtnl_link_unregister(&ipip_link_ops);
1103 if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) 903 if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
1104 pr_info("%s: can't deregister tunnel\n", __func__); 904 printk(KERN_INFO "ipip close: can't deregister tunnel\n");
1105 905
1106 unregister_pernet_device(&ipip_net_ops); 906 unregister_pernet_device(&ipip_net_ops);
1107} 907}
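Beyond the printk/pr_info churn, the revert collapses ipip_init()'s three-step registration (pernet ops, xfrm tunnel handler, rtnl_link_ops) back to two steps, and with it the goto-based unwind labels. The pattern itself is worth keeping in mind; below is a stand-alone sketch with stub register/unregister functions (not the kernel APIs) showing how a late failure unwinds the earlier registrations in reverse order.

/* Sketch of goto-based error unwinding in a module init path. */
#include <stdio.h>

static int  register_pernet(void)        { puts("pernet registered");        return 0; }
static void unregister_pernet(void)      { puts("pernet unregistered");      }
static int  register_xfrm_tunnel(void)   { puts("xfrm handler registered");  return 0; }
static void unregister_xfrm_tunnel(void) { puts("xfrm handler unregistered"); }
static int  register_link_ops(void)      { puts("link ops: failing");        return -1; }

static int tunnel_module_init(void)
{
	int err;

	err = register_pernet();
	if (err < 0)
		goto out;
	err = register_xfrm_tunnel();
	if (err < 0)
		goto xfrm_failed;
	err = register_link_ops();
	if (err < 0)
		goto rtnl_link_failed;
out:
	return err;

rtnl_link_failed:
	unregister_xfrm_tunnel();
xfrm_failed:
	unregister_pernet();
	goto out;
}

int main(void)
{
	printf("init returned %d\n", tunnel_module_init());
	return 0;
}

The reverted ipip_init() keeps only the first unwind step, calling unregister_pernet_device() inline when xfrm registration fails, which is why the labels disappear from the diff.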
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a9454cbd953..58e87915797 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -26,6 +26,7 @@
26 * 26 *
27 */ 27 */
28 28
29#include <asm/system.h>
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <linux/types.h> 31#include <linux/types.h>
31#include <linux/capability.h> 32#include <linux/capability.h>
@@ -60,12 +61,10 @@
60#include <linux/if_arp.h> 61#include <linux/if_arp.h>
61#include <linux/netfilter_ipv4.h> 62#include <linux/netfilter_ipv4.h>
62#include <linux/compat.h> 63#include <linux/compat.h>
63#include <linux/export.h>
64#include <net/ipip.h> 64#include <net/ipip.h>
65#include <net/checksum.h> 65#include <net/checksum.h>
66#include <net/netlink.h> 66#include <net/netlink.h>
67#include <net/fib_rules.h> 67#include <net/fib_rules.h>
68#include <linux/netconf.h>
69 68
70#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) 69#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
71#define CONFIG_IP_PIMSM 1 70#define CONFIG_IP_PIMSM 1
@@ -84,8 +83,8 @@ struct mr_table {
84 struct vif_device vif_table[MAXVIFS]; 83 struct vif_device vif_table[MAXVIFS];
85 int maxvif; 84 int maxvif;
86 atomic_t cache_resolve_queue_len; 85 atomic_t cache_resolve_queue_len;
87 bool mroute_do_assert; 86 int mroute_do_assert;
88 bool mroute_do_pim; 87 int mroute_do_pim;
89#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) 88#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
90 int mroute_reg_vif_num; 89 int mroute_reg_vif_num;
91#endif 90#endif
@@ -125,8 +124,6 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
125static struct kmem_cache *mrt_cachep __read_mostly; 124static struct kmem_cache *mrt_cachep __read_mostly;
126 125
127static struct mr_table *ipmr_new_table(struct net *net, u32 id); 126static struct mr_table *ipmr_new_table(struct net *net, u32 id);
128static void ipmr_free_table(struct mr_table *mrt);
129
130static int ip_mr_forward(struct net *net, struct mr_table *mrt, 127static int ip_mr_forward(struct net *net, struct mr_table *mrt,
131 struct sk_buff *skb, struct mfc_cache *cache, 128 struct sk_buff *skb, struct mfc_cache *cache,
132 int local); 129 int local);
@@ -134,9 +131,6 @@ static int ipmr_cache_report(struct mr_table *mrt,
134 struct sk_buff *pkt, vifi_t vifi, int assert); 131 struct sk_buff *pkt, vifi_t vifi, int assert);
135static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, 132static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
136 struct mfc_cache *c, struct rtmsg *rtm); 133 struct mfc_cache *c, struct rtmsg *rtm);
137static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
138 int cmd);
139static void mroute_clean_tables(struct mr_table *mrt);
140static void ipmr_expire_process(unsigned long arg); 134static void ipmr_expire_process(unsigned long arg);
141 135
142#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES 136#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -224,7 +218,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
224 return 0; 218 return 0;
225} 219}
226 220
227static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { 221static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
228 .family = RTNL_FAMILY_IPMR, 222 .family = RTNL_FAMILY_IPMR,
229 .rule_size = sizeof(struct ipmr_rule), 223 .rule_size = sizeof(struct ipmr_rule),
230 .addr_size = sizeof(u32), 224 .addr_size = sizeof(u32),
@@ -277,7 +271,7 @@ static void __net_exit ipmr_rules_exit(struct net *net)
277 271
278 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { 272 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
279 list_del(&mrt->list); 273 list_del(&mrt->list);
280 ipmr_free_table(mrt); 274 kfree(mrt);
281 } 275 }
282 fib_rules_unregister(net->ipv4.mr_rules_ops); 276 fib_rules_unregister(net->ipv4.mr_rules_ops);
283} 277}
@@ -305,7 +299,7 @@ static int __net_init ipmr_rules_init(struct net *net)
305 299
306static void __net_exit ipmr_rules_exit(struct net *net) 300static void __net_exit ipmr_rules_exit(struct net *net)
307{ 301{
308 ipmr_free_table(net->ipv4.mrt); 302 kfree(net->ipv4.mrt);
309} 303}
310#endif 304#endif
311 305
@@ -342,13 +336,6 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
342 return mrt; 336 return mrt;
343} 337}
344 338
345static void ipmr_free_table(struct mr_table *mrt)
346{
347 del_timer_sync(&mrt->ipmr_expire_timer);
348 mroute_clean_tables(mrt);
349 kfree(mrt);
350}
351
352/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ 339/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
353 340
354static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) 341static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
@@ -537,8 +524,8 @@ failure:
537} 524}
538#endif 525#endif
539 526
540/** 527/*
541 * vif_delete - Delete a VIF entry 528 * Delete a VIF entry
542 * @notify: Set to 1, if the caller is a notifier_call 529 * @notify: Set to 1, if the caller is a notifier_call
543 */ 530 */
544 531
@@ -585,9 +572,6 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
585 in_dev = __in_dev_get_rtnl(dev); 572 in_dev = __in_dev_get_rtnl(dev);
586 if (in_dev) { 573 if (in_dev) {
587 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; 574 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
588 inet_netconf_notify_devconf(dev_net(dev),
589 NETCONFA_MC_FORWARDING,
590 dev->ifindex, &in_dev->cnf);
591 ip_rt_multicast_event(in_dev); 575 ip_rt_multicast_event(in_dev);
592 } 576 }
593 577
@@ -632,7 +616,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
632 e->error = -ETIMEDOUT; 616 e->error = -ETIMEDOUT;
633 memset(&e->msg, 0, sizeof(e->msg)); 617 memset(&e->msg, 0, sizeof(e->msg));
634 618
635 rtnl_unicast(skb, net, NETLINK_CB(skb).portid); 619 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
636 } else { 620 } else {
637 kfree_skb(skb); 621 kfree_skb(skb);
638 } 622 }
@@ -671,7 +655,6 @@ static void ipmr_expire_process(unsigned long arg)
671 } 655 }
672 656
673 list_del(&c->list); 657 list_del(&c->list);
674 mroute_netlink_event(mrt, c, RTM_DELROUTE);
675 ipmr_destroy_unres(mrt, c); 658 ipmr_destroy_unres(mrt, c);
676 } 659 }
677 660
@@ -779,8 +762,6 @@ static int vif_add(struct net *net, struct mr_table *mrt,
779 return -EADDRNOTAVAIL; 762 return -EADDRNOTAVAIL;
780 } 763 }
781 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; 764 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
782 inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex,
783 &in_dev->cnf);
784 ip_rt_multicast_event(in_dev); 765 ip_rt_multicast_event(in_dev);
785 766
786 /* Fill in the VIF structures */ 767 /* Fill in the VIF structures */
@@ -879,7 +860,7 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
879 memset(&e->msg, 0, sizeof(e->msg)); 860 memset(&e->msg, 0, sizeof(e->msg));
880 } 861 }
881 862
882 rtnl_unicast(skb, net, NETLINK_CB(skb).portid); 863 rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
883 } else { 864 } else {
884 ip_mr_forward(net, mrt, skb, c, 0); 865 ip_mr_forward(net, mrt, skb, c, 0);
885 } 866 }
@@ -968,7 +949,8 @@ static int ipmr_cache_report(struct mr_table *mrt,
968 ret = sock_queue_rcv_skb(mroute_sk, skb); 949 ret = sock_queue_rcv_skb(mroute_sk, skb);
969 rcu_read_unlock(); 950 rcu_read_unlock();
970 if (ret < 0) { 951 if (ret < 0) {
971 net_warn_ratelimited("mroute: pending queue full, dropping entries\n"); 952 if (net_ratelimit())
953 printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
972 kfree_skb(skb); 954 kfree_skb(skb);
973 } 955 }
974 956
@@ -1029,7 +1011,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
1029 1011
1030 atomic_inc(&mrt->cache_resolve_queue_len); 1012 atomic_inc(&mrt->cache_resolve_queue_len);
1031 list_add(&c->list, &mrt->mfc_unres_queue); 1013 list_add(&c->list, &mrt->mfc_unres_queue);
1032 mroute_netlink_event(mrt, c, RTM_NEWROUTE);
1033 1014
1034 if (atomic_read(&mrt->cache_resolve_queue_len) == 1) 1015 if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
1035 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); 1016 mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
@@ -1064,7 +1045,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
1064 if (c->mfc_origin == mfc->mfcc_origin.s_addr && 1045 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1065 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { 1046 c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
1066 list_del_rcu(&c->list); 1047 list_del_rcu(&c->list);
1067 mroute_netlink_event(mrt, c, RTM_DELROUTE); 1048
1068 ipmr_cache_free(c); 1049 ipmr_cache_free(c);
1069 return 0; 1050 return 0;
1070 } 1051 }
@@ -1099,7 +1080,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1099 if (!mrtsock) 1080 if (!mrtsock)
1100 c->mfc_flags |= MFC_STATIC; 1081 c->mfc_flags |= MFC_STATIC;
1101 write_unlock_bh(&mrt_lock); 1082 write_unlock_bh(&mrt_lock);
1102 mroute_netlink_event(mrt, c, RTM_NEWROUTE);
1103 return 0; 1083 return 0;
1104 } 1084 }
1105 1085
@@ -1142,7 +1122,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1142 ipmr_cache_resolve(net, mrt, uc, c); 1122 ipmr_cache_resolve(net, mrt, uc, c);
1143 ipmr_cache_free(uc); 1123 ipmr_cache_free(uc);
1144 } 1124 }
1145 mroute_netlink_event(mrt, c, RTM_NEWROUTE);
1146 return 0; 1125 return 0;
1147} 1126}
1148 1127
@@ -1171,7 +1150,6 @@ static void mroute_clean_tables(struct mr_table *mrt)
1171 if (c->mfc_flags & MFC_STATIC) 1150 if (c->mfc_flags & MFC_STATIC)
1172 continue; 1151 continue;
1173 list_del_rcu(&c->list); 1152 list_del_rcu(&c->list);
1174 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1175 ipmr_cache_free(c); 1153 ipmr_cache_free(c);
1176 } 1154 }
1177 } 1155 }
@@ -1180,7 +1158,6 @@ static void mroute_clean_tables(struct mr_table *mrt)
1180 spin_lock_bh(&mfc_unres_lock); 1158 spin_lock_bh(&mfc_unres_lock);
1181 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { 1159 list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
1182 list_del(&c->list); 1160 list_del(&c->list);
1183 mroute_netlink_event(mrt, c, RTM_DELROUTE);
1184 ipmr_destroy_unres(mrt, c); 1161 ipmr_destroy_unres(mrt, c);
1185 } 1162 }
1186 spin_unlock_bh(&mfc_unres_lock); 1163 spin_unlock_bh(&mfc_unres_lock);
@@ -1199,10 +1176,7 @@ static void mrtsock_destruct(struct sock *sk)
1199 ipmr_for_each_table(mrt, net) { 1176 ipmr_for_each_table(mrt, net) {
1200 if (sk == rtnl_dereference(mrt->mroute_sk)) { 1177 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1201 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1178 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1202 inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, 1179 rcu_assign_pointer(mrt->mroute_sk, NULL);
1203 NETCONFA_IFINDEX_ALL,
1204 net->ipv4.devconf_all);
1205 RCU_INIT_POINTER(mrt->mroute_sk, NULL);
1206 mroute_clean_tables(mrt); 1180 mroute_clean_tables(mrt);
1207 } 1181 }
1208 } 1182 }
@@ -1224,24 +1198,23 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1224 struct net *net = sock_net(sk); 1198 struct net *net = sock_net(sk);
1225 struct mr_table *mrt; 1199 struct mr_table *mrt;
1226 1200
1227 if (sk->sk_type != SOCK_RAW ||
1228 inet_sk(sk)->inet_num != IPPROTO_IGMP)
1229 return -EOPNOTSUPP;
1230
1231 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); 1201 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1232 if (mrt == NULL) 1202 if (mrt == NULL)
1233 return -ENOENT; 1203 return -ENOENT;
1234 1204
1235 if (optname != MRT_INIT) { 1205 if (optname != MRT_INIT) {
1236 if (sk != rcu_access_pointer(mrt->mroute_sk) && 1206 if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
1237 !ns_capable(net->user_ns, CAP_NET_ADMIN)) 1207 !capable(CAP_NET_ADMIN))
1238 return -EACCES; 1208 return -EACCES;
1239 } 1209 }
1240 1210
1241 switch (optname) { 1211 switch (optname) {
1242 case MRT_INIT: 1212 case MRT_INIT:
1213 if (sk->sk_type != SOCK_RAW ||
1214 inet_sk(sk)->inet_num != IPPROTO_IGMP)
1215 return -EOPNOTSUPP;
1243 if (optlen != sizeof(int)) 1216 if (optlen != sizeof(int))
1244 return -EINVAL; 1217 return -ENOPROTOOPT;
1245 1218
1246 rtnl_lock(); 1219 rtnl_lock();
1247 if (rtnl_dereference(mrt->mroute_sk)) { 1220 if (rtnl_dereference(mrt->mroute_sk)) {
@@ -1253,14 +1226,11 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1253 if (ret == 0) { 1226 if (ret == 0) {
1254 rcu_assign_pointer(mrt->mroute_sk, sk); 1227 rcu_assign_pointer(mrt->mroute_sk, sk);
1255 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; 1228 IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1256 inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
1257 NETCONFA_IFINDEX_ALL,
1258 net->ipv4.devconf_all);
1259 } 1229 }
1260 rtnl_unlock(); 1230 rtnl_unlock();
1261 return ret; 1231 return ret;
1262 case MRT_DONE: 1232 case MRT_DONE:
1263 if (sk != rcu_access_pointer(mrt->mroute_sk)) 1233 if (sk != rcu_dereference_raw(mrt->mroute_sk))
1264 return -EACCES; 1234 return -EACCES;
1265 return ip_ra_control(sk, 0, NULL); 1235 return ip_ra_control(sk, 0, NULL);
1266 case MRT_ADD_VIF: 1236 case MRT_ADD_VIF:
@@ -1305,11 +1275,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1305 case MRT_ASSERT: 1275 case MRT_ASSERT:
1306 { 1276 {
1307 int v; 1277 int v;
1308 if (optlen != sizeof(v))
1309 return -EINVAL;
1310 if (get_user(v, (int __user *)optval)) 1278 if (get_user(v, (int __user *)optval))
1311 return -EFAULT; 1279 return -EFAULT;
1312 mrt->mroute_do_assert = v; 1280 mrt->mroute_do_assert = (v) ? 1 : 0;
1313 return 0; 1281 return 0;
1314 } 1282 }
1315#ifdef CONFIG_IP_PIMSM 1283#ifdef CONFIG_IP_PIMSM
@@ -1317,11 +1285,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1317 { 1285 {
1318 int v; 1286 int v;
1319 1287
1320 if (optlen != sizeof(v))
1321 return -EINVAL;
1322 if (get_user(v, (int __user *)optval)) 1288 if (get_user(v, (int __user *)optval))
1323 return -EFAULT; 1289 return -EFAULT;
1324 v = !!v; 1290 v = (v) ? 1 : 0;
1325 1291
1326 rtnl_lock(); 1292 rtnl_lock();
1327 ret = 0; 1293 ret = 0;
@@ -1343,10 +1309,6 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1343 if (get_user(v, (u32 __user *)optval)) 1309 if (get_user(v, (u32 __user *)optval))
1344 return -EFAULT; 1310 return -EFAULT;
1345 1311
1346 /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
1347 if (v != RT_TABLE_DEFAULT && v >= 1000000000)
1348 return -EINVAL;
1349
1350 rtnl_lock(); 1312 rtnl_lock();
1351 ret = 0; 1313 ret = 0;
1352 if (sk == rtnl_dereference(mrt->mroute_sk)) { 1314 if (sk == rtnl_dereference(mrt->mroute_sk)) {
@@ -1354,8 +1316,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
1354 } else { 1316 } else {
1355 if (!ipmr_new_table(net, v)) 1317 if (!ipmr_new_table(net, v))
1356 ret = -ENOMEM; 1318 ret = -ENOMEM;
1357 else 1319 raw_sk(sk)->ipmr_table = v;
1358 raw_sk(sk)->ipmr_table = v;
1359 } 1320 }
1360 rtnl_unlock(); 1321 rtnl_unlock();
1361 return ret; 1322 return ret;
@@ -1381,10 +1342,6 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
1381 struct net *net = sock_net(sk); 1342 struct net *net = sock_net(sk);
1382 struct mr_table *mrt; 1343 struct mr_table *mrt;
1383 1344
1384 if (sk->sk_type != SOCK_RAW ||
1385 inet_sk(sk)->inet_num != IPPROTO_IGMP)
1386 return -EOPNOTSUPP;
1387
1388 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); 1345 mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1389 if (mrt == NULL) 1346 if (mrt == NULL)
1390 return -ENOENT; 1347 return -ENOENT;
@@ -1562,6 +1519,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1562 struct mr_table *mrt; 1519 struct mr_table *mrt;
1563 struct vif_device *v; 1520 struct vif_device *v;
1564 int ct; 1521 int ct;
1522 LIST_HEAD(list);
1565 1523
1566 if (event != NETDEV_UNREGISTER) 1524 if (event != NETDEV_UNREGISTER)
1567 return NOTIFY_DONE; 1525 return NOTIFY_DONE;
@@ -1570,9 +1528,10 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
1570 v = &mrt->vif_table[0]; 1528 v = &mrt->vif_table[0];
1571 for (ct = 0; ct < mrt->maxvif; ct++, v++) { 1529 for (ct = 0; ct < mrt->maxvif; ct++, v++) {
1572 if (v->dev == dev) 1530 if (v->dev == dev)
1573 vif_delete(mrt, ct, 1, NULL); 1531 vif_delete(mrt, ct, 1, &list);
1574 } 1532 }
1575 } 1533 }
1534 unregister_netdevice_many(&list);
1576 return NOTIFY_DONE; 1535 return NOTIFY_DONE;
1577} 1536}
1578 1537
@@ -1618,7 +1577,6 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
1618 struct ip_options *opt = &(IPCB(skb)->opt); 1577 struct ip_options *opt = &(IPCB(skb)->opt);
1619 1578
1620 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); 1579 IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1621 IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);
1622 1580
1623 if (unlikely(opt->optlen)) 1581 if (unlikely(opt->optlen))
1624 ip_forward_options(skb); 1582 ip_forward_options(skb);
@@ -1839,12 +1797,9 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1839 .daddr = iph->daddr, 1797 .daddr = iph->daddr,
1840 .saddr = iph->saddr, 1798 .saddr = iph->saddr,
1841 .flowi4_tos = RT_TOS(iph->tos), 1799 .flowi4_tos = RT_TOS(iph->tos),
1842 .flowi4_oif = (rt_is_output_route(rt) ? 1800 .flowi4_oif = rt->rt_oif,
1843 skb->dev->ifindex : 0), 1801 .flowi4_iif = rt->rt_iif,
1844 .flowi4_iif = (rt_is_output_route(rt) ? 1802 .flowi4_mark = rt->rt_mark,
1845 LOOPBACK_IFINDEX :
1846 skb->dev->ifindex),
1847 .flowi4_mark = skb->mark,
1848 }; 1803 };
1849 struct mr_table *mrt; 1804 struct mr_table *mrt;
1850 int err; 1805 int err;
@@ -2053,44 +2008,37 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2053{ 2008{
2054 int ct; 2009 int ct;
2055 struct rtnexthop *nhp; 2010 struct rtnexthop *nhp;
2056 struct nlattr *mp_attr; 2011 u8 *b = skb_tail_pointer(skb);
2057 struct rta_mfc_stats mfcs; 2012 struct rtattr *mp_head;
2058 2013
2059 /* If cache is unresolved, don't try to parse IIF and OIF */ 2014 /* If cache is unresolved, don't try to parse IIF and OIF */
2060 if (c->mfc_parent >= MAXVIFS) 2015 if (c->mfc_parent >= MAXVIFS)
2061 return -ENOENT; 2016 return -ENOENT;
2062 2017
2063 if (VIF_EXISTS(mrt, c->mfc_parent) && 2018 if (VIF_EXISTS(mrt, c->mfc_parent))
2064 nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) 2019 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
2065 return -EMSGSIZE;
2066 2020
2067 if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) 2021 mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
2068 return -EMSGSIZE;
2069 2022
2070 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { 2023 for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
2071 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { 2024 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
2072 if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { 2025 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
2073 nla_nest_cancel(skb, mp_attr); 2026 goto rtattr_failure;
2074 return -EMSGSIZE; 2027 nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
2075 }
2076
2077 nhp->rtnh_flags = 0; 2028 nhp->rtnh_flags = 0;
2078 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 2029 nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
2079 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; 2030 nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
2080 nhp->rtnh_len = sizeof(*nhp); 2031 nhp->rtnh_len = sizeof(*nhp);
2081 } 2032 }
2082 } 2033 }
2083 2034 mp_head->rta_type = RTA_MULTIPATH;
2084 nla_nest_end(skb, mp_attr); 2035 mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
2085
2086 mfcs.mfcs_packets = c->mfc_un.res.pkt;
2087 mfcs.mfcs_bytes = c->mfc_un.res.bytes;
2088 mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
2089 if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0)
2090 return -EMSGSIZE;
2091
2092 rtm->rtm_type = RTN_MULTICAST; 2036 rtm->rtm_type = RTN_MULTICAST;
2093 return 1; 2037 return 1;
2038
2039rtattr_failure:
2040 nlmsg_trim(skb, b);
2041 return -EMSGSIZE;
2094} 2042}
2095 2043
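The __ipmr_fill_mroute() change above swaps nla_nest_start()/nla_nest_end() and nla_put_*() for hand-rolled RTA_PUT()/rtattr arithmetic; both boil down to the same mechanism: reserve a TLV header for the RTA_MULTIPATH container, append the rtnexthop children, then patch the container length. Below is a portable sketch of that mechanism using a simplified 4-byte-aligned TLV, not the kernel's struct nlattr helpers; the attribute type numbers and payloads are arbitrary.

/* Sketch of building a nested TLV attribute: header first, children next,
 * outer length patched at the end. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TLV_ALIGN(len) (((len) + 3U) & ~3U)

struct tlv {			/* stand-in for struct nlattr / struct rtattr */
	uint16_t len;		/* header + payload, unaligned */
	uint16_t type;
};

static unsigned char buf[256];
static size_t used;

static size_t nest_start(uint16_t type)
{
	size_t off = used;
	struct tlv hdr = { sizeof(struct tlv), type };

	memcpy(buf + used, &hdr, sizeof(hdr));
	used += TLV_ALIGN(sizeof(hdr));
	return off;		/* remembered so nest_end() can patch the length */
}

static void put_u32(uint16_t type, uint32_t val)
{
	struct tlv hdr = { sizeof(struct tlv) + sizeof(val), type };

	memcpy(buf + used, &hdr, sizeof(hdr));
	memcpy(buf + used + sizeof(hdr), &val, sizeof(val));
	used += TLV_ALIGN(hdr.len);
}

static void nest_end(size_t nest_off)
{
	uint16_t len = (uint16_t)(used - nest_off);

	memcpy(buf + nest_off, &len, sizeof(len));	/* patch outer length */
}

int main(void)
{
	size_t mp = nest_start(9);	/* 9 stands in for RTA_MULTIPATH */
	uint16_t outer_len;

	put_u32(1, 3);			/* illustrative nexthop payloads */
	put_u32(1, 7);
	nest_end(mp);

	memcpy(&outer_len, buf + mp, sizeof(outer_len));
	printf("outer attr length %u, buffer used %zu\n",
	       (unsigned)outer_len, used);
	return 0;
}

nla_nest_end() performs exactly this length patch against the nlattr returned by nla_nest_start(), while the older code recomputes it from skb_tail_pointer() by hand.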
2096int ipmr_get_route(struct net *net, struct sk_buff *skb, 2044int ipmr_get_route(struct net *net, struct sk_buff *skb,
@@ -2158,13 +2106,12 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
2158} 2106}
2159 2107
2160static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, 2108static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2161 u32 portid, u32 seq, struct mfc_cache *c, int cmd) 2109 u32 pid, u32 seq, struct mfc_cache *c)
2162{ 2110{
2163 struct nlmsghdr *nlh; 2111 struct nlmsghdr *nlh;
2164 struct rtmsg *rtm; 2112 struct rtmsg *rtm;
2165 int err;
2166 2113
2167 nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), NLM_F_MULTI); 2114 nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2168 if (nlh == NULL) 2115 if (nlh == NULL)
2169 return -EMSGSIZE; 2116 return -EMSGSIZE;
2170 2117
@@ -2174,22 +2121,16 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2174 rtm->rtm_src_len = 32; 2121 rtm->rtm_src_len = 32;
2175 rtm->rtm_tos = 0; 2122 rtm->rtm_tos = 0;
2176 rtm->rtm_table = mrt->id; 2123 rtm->rtm_table = mrt->id;
2177 if (nla_put_u32(skb, RTA_TABLE, mrt->id)) 2124 NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2178 goto nla_put_failure;
2179 rtm->rtm_type = RTN_MULTICAST; 2125 rtm->rtm_type = RTN_MULTICAST;
2180 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 2126 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2181 if (c->mfc_flags & MFC_STATIC) 2127 rtm->rtm_protocol = RTPROT_UNSPEC;
2182 rtm->rtm_protocol = RTPROT_STATIC;
2183 else
2184 rtm->rtm_protocol = RTPROT_MROUTED;
2185 rtm->rtm_flags = 0; 2128 rtm->rtm_flags = 0;
2186 2129
2187 if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) || 2130 NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2188 nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp)) 2131 NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2189 goto nla_put_failure; 2132
2190 err = __ipmr_fill_mroute(mrt, skb, c, rtm); 2133 if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2191 /* do not break the dump if cache is unresolved */
2192 if (err < 0 && err != -ENOENT)
2193 goto nla_put_failure; 2134 goto nla_put_failure;
2194 2135
2195 return nlmsg_end(skb, nlh); 2136 return nlmsg_end(skb, nlh);
@@ -2199,52 +2140,6 @@ nla_put_failure:
2199 return -EMSGSIZE; 2140 return -EMSGSIZE;
2200} 2141}
2201 2142
2202static size_t mroute_msgsize(bool unresolved, int maxvif)
2203{
2204 size_t len =
2205 NLMSG_ALIGN(sizeof(struct rtmsg))
2206 + nla_total_size(4) /* RTA_TABLE */
2207 + nla_total_size(4) /* RTA_SRC */
2208 + nla_total_size(4) /* RTA_DST */
2209 ;
2210
2211 if (!unresolved)
2212 len = len
2213 + nla_total_size(4) /* RTA_IIF */
2214 + nla_total_size(0) /* RTA_MULTIPATH */
2215 + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
2216 /* RTA_MFC_STATS */
2217 + nla_total_size(sizeof(struct rta_mfc_stats))
2218 ;
2219
2220 return len;
2221}
2222
2223static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
2224 int cmd)
2225{
2226 struct net *net = read_pnet(&mrt->net);
2227 struct sk_buff *skb;
2228 int err = -ENOBUFS;
2229
2230 skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
2231 GFP_ATOMIC);
2232 if (skb == NULL)
2233 goto errout;
2234
2235 err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd);
2236 if (err < 0)
2237 goto errout;
2238
2239 rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
2240 return;
2241
2242errout:
2243 kfree_skb(skb);
2244 if (err < 0)
2245 rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
2246}
2247
2248static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) 2143static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2249{ 2144{
2250 struct net *net = sock_net(skb->sk); 2145 struct net *net = sock_net(skb->sk);
@@ -2269,31 +2164,15 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2269 if (e < s_e) 2164 if (e < s_e)
2270 goto next_entry; 2165 goto next_entry;
2271 if (ipmr_fill_mroute(mrt, skb, 2166 if (ipmr_fill_mroute(mrt, skb,
2272 NETLINK_CB(cb->skb).portid, 2167 NETLINK_CB(cb->skb).pid,
2273 cb->nlh->nlmsg_seq, 2168 cb->nlh->nlmsg_seq,
2274 mfc, RTM_NEWROUTE) < 0) 2169 mfc) < 0)
2275 goto done; 2170 goto done;
2276next_entry: 2171next_entry:
2277 e++; 2172 e++;
2278 } 2173 }
2279 e = s_e = 0; 2174 e = s_e = 0;
2280 } 2175 }
2281 spin_lock_bh(&mfc_unres_lock);
2282 list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
2283 if (e < s_e)
2284 goto next_entry2;
2285 if (ipmr_fill_mroute(mrt, skb,
2286 NETLINK_CB(cb->skb).portid,
2287 cb->nlh->nlmsg_seq,
2288 mfc, RTM_NEWROUTE) < 0) {
2289 spin_unlock_bh(&mfc_unres_lock);
2290 goto done;
2291 }
2292next_entry2:
2293 e++;
2294 }
2295 spin_unlock_bh(&mfc_unres_lock);
2296 e = s_e = 0;
2297 s_h = 0; 2176 s_h = 0;
2298next_table: 2177next_table:
2299 t++; 2178 t++;
@@ -2660,7 +2539,7 @@ int __init ip_mr_init(void)
2660 goto reg_notif_fail; 2539 goto reg_notif_fail;
2661#ifdef CONFIG_IP_PIMSM_V2 2540#ifdef CONFIG_IP_PIMSM_V2
2662 if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) { 2541 if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2663 pr_err("%s: can't add PIM protocol\n", __func__); 2542 printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2664 err = -EAGAIN; 2543 err = -EAGAIN;
2665 goto add_proto_fail; 2544 goto add_proto_fail;
2666 } 2545 }
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 4c0cf63dd92..929b27bdeb7 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -5,14 +5,13 @@
5#include <linux/ip.h> 5#include <linux/ip.h>
6#include <linux/skbuff.h> 6#include <linux/skbuff.h>
7#include <linux/gfp.h> 7#include <linux/gfp.h>
8#include <linux/export.h>
9#include <net/route.h> 8#include <net/route.h>
10#include <net/xfrm.h> 9#include <net/xfrm.h>
11#include <net/ip.h> 10#include <net/ip.h>
12#include <net/netfilter/nf_queue.h> 11#include <net/netfilter/nf_queue.h>
13 12
14/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ 13/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
15int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type) 14int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
16{ 15{
17 struct net *net = dev_net(skb_dst(skb)->dev); 16 struct net *net = dev_net(skb_dst(skb)->dev);
18 const struct iphdr *iph = ip_hdr(skb); 17 const struct iphdr *iph = ip_hdr(skb);
@@ -64,14 +63,50 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
64 /* Change in oif may mean change in hh_len. */ 63 /* Change in oif may mean change in hh_len. */
65 hh_len = skb_dst(skb)->dev->hard_header_len; 64 hh_len = skb_dst(skb)->dev->hard_header_len;
66 if (skb_headroom(skb) < hh_len && 65 if (skb_headroom(skb) < hh_len &&
67 pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 66 pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
68 0, GFP_ATOMIC))
69 return -1; 67 return -1;
70 68
71 return 0; 69 return 0;
72} 70}
73EXPORT_SYMBOL(ip_route_me_harder); 71EXPORT_SYMBOL(ip_route_me_harder);
74 72
73#ifdef CONFIG_XFRM
74int ip_xfrm_me_harder(struct sk_buff *skb)
75{
76 struct flowi fl;
77 unsigned int hh_len;
78 struct dst_entry *dst;
79
80 if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
81 return 0;
82 if (xfrm_decode_session(skb, &fl, AF_INET) < 0)
83 return -1;
84
85 dst = skb_dst(skb);
86 if (dst->xfrm)
87 dst = ((struct xfrm_dst *)dst)->route;
88 dst_hold(dst);
89
90 dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
91 if (IS_ERR(dst))
92 return -1;
93
94 skb_dst_drop(skb);
95 skb_dst_set(skb, dst);
96
97 /* Change in oif may mean change in hh_len. */
98 hh_len = skb_dst(skb)->dev->hard_header_len;
99 if (skb_headroom(skb) < hh_len &&
100 pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
101 return -1;
102 return 0;
103}
104EXPORT_SYMBOL(ip_xfrm_me_harder);
105#endif
106
107void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
108EXPORT_SYMBOL(ip_nat_decode_session);
109
75/* 110/*
76 * Extra routing may be needed on local out, as the QUEUE target never 111 * Extra routing may be needed on local out, as the QUEUE target never
77 * returns control to the table. 112 * returns control to the table.
@@ -188,15 +223,25 @@ static const struct nf_afinfo nf_ip_afinfo = {
188 .route_key_size = sizeof(struct ip_rt_info), 223 .route_key_size = sizeof(struct ip_rt_info),
189}; 224};
190 225
191static int __init ipv4_netfilter_init(void) 226static int ipv4_netfilter_init(void)
192{ 227{
193 return nf_register_afinfo(&nf_ip_afinfo); 228 return nf_register_afinfo(&nf_ip_afinfo);
194} 229}
195 230
196static void __exit ipv4_netfilter_fini(void) 231static void ipv4_netfilter_fini(void)
197{ 232{
198 nf_unregister_afinfo(&nf_ip_afinfo); 233 nf_unregister_afinfo(&nf_ip_afinfo);
199} 234}
200 235
201module_init(ipv4_netfilter_init); 236module_init(ipv4_netfilter_init);
202module_exit(ipv4_netfilter_fini); 237module_exit(ipv4_netfilter_fini);
238
239#ifdef CONFIG_SYSCTL
240struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {
241 { .procname = "net", },
242 { .procname = "ipv4", },
243 { .procname = "netfilter", },
244 { }
245};
246EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
247#endif /* CONFIG_SYSCTL */
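Both ip_route_me_harder() and the restored ip_xfrm_me_harder() end the same way: after rerouting, the skb may now sit behind a device with a larger hard_header_len, so the code grows the headroom via pskb_expand_head() when skb_headroom() falls short. Below is a stand-alone sketch of that "reserve more front room if the new device needs it" idea on a plain buffer; pskb_expand_head() itself also handles shared data, fragments and cloned skbs, none of which is modelled here, and all names are illustrative.

/* Sketch of growing the headroom of a packet buffer. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pkt {
	unsigned char *head;	/* start of allocation */
	unsigned char *data;	/* start of payload    */
	size_t len;		/* payload length      */
};

static size_t headroom(const struct pkt *p)
{
	return (size_t)(p->data - p->head);
}

/* Returns 0 on success, -1 on allocation failure. */
static int reserve_headroom(struct pkt *p, size_t hh_len)
{
	size_t off = headroom(p), extra;
	unsigned char *nhead;

	if (off >= hh_len)
		return 0;			/* already enough room */

	extra = hh_len - off;
	nhead = realloc(p->head, off + extra + p->len);
	if (!nhead)
		return -1;
	/* shift the payload back so the front gains 'extra' bytes */
	memmove(nhead + off + extra, nhead + off, p->len);
	p->head = nhead;
	p->data = nhead + off + extra;
	return 0;
}

int main(void)
{
	struct pkt p;

	p.head = malloc(2 + 20);
	if (!p.head)
		return 1;
	p.data = p.head + 2;			/* only 2 bytes of headroom */
	p.len = 20;
	memset(p.data, 0xab, p.len);

	printf("headroom before: %zu\n", headroom(&p));
	if (reserve_headroom(&p, 14) == 0)	/* e.g. an Ethernet header */
		printf("headroom after:  %zu\n", headroom(&p));
	free(p.head);
	return 0;
}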
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d8d6f2a5bf1..73b4e91a87e 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -27,7 +27,7 @@ config NF_CONNTRACK_IPV4
27 27
28config NF_CONNTRACK_PROC_COMPAT 28config NF_CONNTRACK_PROC_COMPAT
29 bool "proc/sysctl compatibility with old connection tracking" 29 bool "proc/sysctl compatibility with old connection tracking"
30 depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4 30 depends on NF_CONNTRACK_IPV4
31 default y 31 default y
32 help 32 help
33 This option enables /proc and sysctl compatibility with the old 33 This option enables /proc and sysctl compatibility with the old
@@ -76,21 +76,11 @@ config IP_NF_MATCH_AH
76config IP_NF_MATCH_ECN 76config IP_NF_MATCH_ECN
77 tristate '"ecn" match support' 77 tristate '"ecn" match support'
78 depends on NETFILTER_ADVANCED 78 depends on NETFILTER_ADVANCED
79 select NETFILTER_XT_MATCH_ECN 79 help
80 ---help--- 80 This option adds a `ECN' match, which allows you to match against
81 This is a backwards-compat option for the user's convenience 81 the IPv4 and TCP header ECN fields.
82 (e.g. when running oldconfig). It selects
83 CONFIG_NETFILTER_XT_MATCH_ECN.
84
85config IP_NF_MATCH_RPFILTER
86 tristate '"rpfilter" reverse path filter match support'
87 depends on NETFILTER_ADVANCED
88 ---help---
89 This option allows you to match packets whose replies would
90 go out via the interface the packet came in.
91 82
92 To compile it as a module, choose M here. If unsure, say N. 83 To compile it as a module, choose M here. If unsure, say N.
93 The module will be called ipt_rpfilter.
94 84
95config IP_NF_MATCH_TTL 85config IP_NF_MATCH_TTL
96 tristate '"ttl" match support' 86 tristate '"ttl" match support'
@@ -123,6 +113,27 @@ config IP_NF_TARGET_REJECT
123 113
124 To compile it as a module, choose M here. If unsure, say N. 114 To compile it as a module, choose M here. If unsure, say N.
125 115
116config IP_NF_TARGET_REJECT_SKERR
117 bool "Force socket error when rejecting with icmp*"
118 depends on IP_NF_TARGET_REJECT
119 default n
120 help
121 This option enables turning a "--reject-with icmp*" into a matching
122 socket error also.
123 The REJECT target normally allows sending an ICMP message. But it
124 leaves the local socket unaware of any ingress rejects.
125
126 If unsure, say N.
127
128config IP_NF_TARGET_LOG
129 tristate "LOG target support"
130 default m if NETFILTER_ADVANCED=n
131 help
132 This option adds a `LOG' target, which allows you to create rules in
133 any iptables table which records the packet header to the syslog.
134
135 To compile it as a module, choose M here. If unsure, say N.
136
126config IP_NF_TARGET_ULOG 137config IP_NF_TARGET_ULOG
127 tristate "ULOG target support" 138 tristate "ULOG target support"
128 default m if NETFILTER_ADVANCED=n 139 default m if NETFILTER_ADVANCED=n
@@ -143,22 +154,25 @@ config IP_NF_TARGET_ULOG
143 To compile it as a module, choose M here. If unsure, say N. 154 To compile it as a module, choose M here. If unsure, say N.
144 155
145# NAT + specific targets: nf_conntrack 156# NAT + specific targets: nf_conntrack
146config NF_NAT_IPV4 157config NF_NAT
147 tristate "IPv4 NAT" 158 tristate "Full NAT"
148 depends on NF_CONNTRACK_IPV4 159 depends on NF_CONNTRACK_IPV4
149 default m if NETFILTER_ADVANCED=n 160 default m if NETFILTER_ADVANCED=n
150 select NF_NAT
151 help 161 help
152 The IPv4 NAT option allows masquerading, port forwarding and other 162 The Full NAT option allows masquerading, port forwarding and other
153 forms of full Network Address Port Translation. It is controlled by 163 forms of full Network Address Port Translation. It is controlled by
154 the `nat' table in iptables: see the man page for iptables(8). 164 the `nat' table in iptables: see the man page for iptables(8).
155 165
156 To compile it as a module, choose M here. If unsure, say N. 166 To compile it as a module, choose M here. If unsure, say N.
157 167
158if NF_NAT_IPV4 168config NF_NAT_NEEDED
169 bool
170 depends on NF_NAT
171 default y
159 172
160config IP_NF_TARGET_MASQUERADE 173config IP_NF_TARGET_MASQUERADE
161 tristate "MASQUERADE target support" 174 tristate "MASQUERADE target support"
175 depends on NF_NAT
162 default m if NETFILTER_ADVANCED=n 176 default m if NETFILTER_ADVANCED=n
163 help 177 help
164 Masquerading is a special case of NAT: all outgoing connections are 178 Masquerading is a special case of NAT: all outgoing connections are
@@ -171,27 +185,30 @@ config IP_NF_TARGET_MASQUERADE
171 185
172config IP_NF_TARGET_NETMAP 186config IP_NF_TARGET_NETMAP
173 tristate "NETMAP target support" 187 tristate "NETMAP target support"
188 depends on NF_NAT
174 depends on NETFILTER_ADVANCED 189 depends on NETFILTER_ADVANCED
175 select NETFILTER_XT_TARGET_NETMAP 190 help
176 ---help--- 191 NETMAP is an implementation of static 1:1 NAT mapping of network
177 This is a backwards-compat option for the user's convenience 192 addresses. It maps the network address part, while keeping the host
178 (e.g. when running oldconfig). It selects 193 address part intact.
179 CONFIG_NETFILTER_XT_TARGET_NETMAP. 194
195 To compile it as a module, choose M here. If unsure, say N.
180 196
181config IP_NF_TARGET_REDIRECT 197config IP_NF_TARGET_REDIRECT
182 tristate "REDIRECT target support" 198 tristate "REDIRECT target support"
199 depends on NF_NAT
183 depends on NETFILTER_ADVANCED 200 depends on NETFILTER_ADVANCED
184 select NETFILTER_XT_TARGET_REDIRECT 201 help
185 ---help--- 202 REDIRECT is a special case of NAT: all incoming connections are
186 This is a backwards-compat option for the user's convenience 203 mapped onto the incoming interface's address, causing the packets to
187 (e.g. when running oldconfig). It selects 204 come to the local machine instead of passing through. This is
188 CONFIG_NETFILTER_XT_TARGET_REDIRECT. 205 useful for transparent proxies.
189 206
190endif 207 To compile it as a module, choose M here. If unsure, say N.
191 208
192config NF_NAT_SNMP_BASIC 209config NF_NAT_SNMP_BASIC
193 tristate "Basic SNMP-ALG support" 210 tristate "Basic SNMP-ALG support"
194 depends on NF_CONNTRACK_SNMP && NF_NAT_IPV4 211 depends on NF_CONNTRACK_SNMP && NF_NAT
195 depends on NETFILTER_ADVANCED 212 depends on NETFILTER_ADVANCED
196 default NF_NAT && NF_CONNTRACK_SNMP 213 default NF_NAT && NF_CONNTRACK_SNMP
197 ---help--- 214 ---help---
@@ -213,21 +230,61 @@ config NF_NAT_SNMP_BASIC
213# <expr> '&&' <expr> (6) 230# <expr> '&&' <expr> (6)
214# 231#
215# (6) Returns the result of min(/expr/, /expr/). 232# (6) Returns the result of min(/expr/, /expr/).
233config NF_NAT_PROTO_DCCP
234 tristate
235 depends on NF_NAT && NF_CT_PROTO_DCCP
236 default NF_NAT && NF_CT_PROTO_DCCP
216 237
217config NF_NAT_PROTO_GRE 238config NF_NAT_PROTO_GRE
218 tristate 239 tristate
219 depends on NF_NAT_IPV4 && NF_CT_PROTO_GRE 240 depends on NF_NAT && NF_CT_PROTO_GRE
241
242config NF_NAT_PROTO_UDPLITE
243 tristate
244 depends on NF_NAT && NF_CT_PROTO_UDPLITE
245 default NF_NAT && NF_CT_PROTO_UDPLITE
246
247config NF_NAT_PROTO_SCTP
248 tristate
249 default NF_NAT && NF_CT_PROTO_SCTP
250 depends on NF_NAT && NF_CT_PROTO_SCTP
251 select LIBCRC32C
252
253config NF_NAT_FTP
254 tristate
255 depends on NF_CONNTRACK && NF_NAT
256 default NF_NAT && NF_CONNTRACK_FTP
257
258config NF_NAT_IRC
259 tristate
260 depends on NF_CONNTRACK && NF_NAT
261 default NF_NAT && NF_CONNTRACK_IRC
262
263config NF_NAT_TFTP
264 tristate
265 depends on NF_CONNTRACK && NF_NAT
266 default NF_NAT && NF_CONNTRACK_TFTP
267
268config NF_NAT_AMANDA
269 tristate
270 depends on NF_CONNTRACK && NF_NAT
271 default NF_NAT && NF_CONNTRACK_AMANDA
220 272
221config NF_NAT_PPTP 273config NF_NAT_PPTP
222 tristate 274 tristate
223 depends on NF_CONNTRACK && NF_NAT_IPV4 275 depends on NF_CONNTRACK && NF_NAT
224 default NF_NAT_IPV4 && NF_CONNTRACK_PPTP 276 default NF_NAT && NF_CONNTRACK_PPTP
225 select NF_NAT_PROTO_GRE 277 select NF_NAT_PROTO_GRE
226 278
227config NF_NAT_H323 279config NF_NAT_H323
228 tristate 280 tristate
229 depends on NF_CONNTRACK && NF_NAT_IPV4 281 depends on NF_CONNTRACK && NF_NAT
230 default NF_NAT_IPV4 && NF_CONNTRACK_H323 282 default NF_NAT && NF_CONNTRACK_H323
283
284config NF_NAT_SIP
285 tristate
286 depends on NF_CONNTRACK && NF_NAT
287 default NF_NAT && NF_CONNTRACK_SIP
231 288
232# mangle + specific targets 289# mangle + specific targets
233config IP_NF_MANGLE 290config IP_NF_MANGLE
@@ -280,6 +337,7 @@ config IP_NF_TARGET_TTL
280# raw + specific targets 337# raw + specific targets
281config IP_NF_RAW 338config IP_NF_RAW
282 tristate 'raw table support (required for NOTRACK/TRACE)' 339 tristate 'raw table support (required for NOTRACK/TRACE)'
340 depends on NETFILTER_ADVANCED
283 help 341 help
284 This option adds a `raw' table to iptables. This table is the very 342 This option adds a `raw' table to iptables. This table is the very
285 first in the netfilter framework and hooks in at the PREROUTING 343 first in the netfilter framework and hooks in at the PREROUTING
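The dependency-expression comment in the NF_NAT_SNMP_BASIC hunk above notes that Kconfig's `&&' evaluates to min() of its operands over the tristate values n < m < y. A minimal userspace C sketch of that rule (the enum and function names are chosen here for illustration, not taken from the kernel):

#include <stdio.h>

/* Kconfig tristate ordering: n < m < y. */
enum tristate { TRI_N = 0, TRI_M = 1, TRI_Y = 2 };

/* "expr && expr" evaluates to min(expr, expr), per the comment above. */
static enum tristate tri_and(enum tristate a, enum tristate b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* e.g. "default NF_NAT && NF_CONNTRACK_FTP" with NF_NAT=m, FTP=y */
        printf("m && y -> %d (m)\n", tri_and(TRI_M, TRI_Y));
        printf("y && y -> %d (y)\n", tri_and(TRI_Y, TRI_Y));
        printf("n && y -> %d (n)\n", tri_and(TRI_N, TRI_Y));
        return 0;
}

This is why a helper such as NF_NAT_FTP defaults to m whenever either NF_NAT or NF_CONNTRACK_FTP is built as a module.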
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 007b128eecc..dca2082ec68 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,22 +10,32 @@ nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
10endif 10endif
11endif 11endif
12 12
13nf_nat-y := nf_nat_core.o nf_nat_helper.o nf_nat_proto_unknown.o nf_nat_proto_common.o nf_nat_proto_tcp.o nf_nat_proto_udp.o nf_nat_proto_icmp.o
14iptable_nat-y := nf_nat_rule.o nf_nat_standalone.o
15
13# connection tracking 16# connection tracking
14obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o 17obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
15 18
16nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o 19obj-$(CONFIG_NF_NAT) += nf_nat.o
17obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
18 20
19# defrag 21# defrag
20obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o 22obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
21 23
22# NAT helpers (nf_conntrack) 24# NAT helpers (nf_conntrack)
25obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
26obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
23obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o 27obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
28obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
24obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o 29obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
30obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
25obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o 31obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
32obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
26 33
27# NAT protocols (nf_nat) 34# NAT protocols (nf_nat)
35obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
28obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o 36obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
37obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
38obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
29 39
30# generic IP tables 40# generic IP tables
31obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o 41obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
@@ -33,18 +43,21 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
33# the three instances of ip_tables 43# the three instances of ip_tables
34obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o 44obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
35obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o 45obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
36obj-$(CONFIG_NF_NAT_IPV4) += iptable_nat.o 46obj-$(CONFIG_NF_NAT) += iptable_nat.o
37obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o 47obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
38obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o 48obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
39 49
40# matches 50# matches
41obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o 51obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
42obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o 52obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
43 53
44# targets 54# targets
45obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o 55obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
46obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o 56obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
57obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
47obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o 58obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
59obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
60obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
48obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o 61obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
49obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o 62obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
50 63
@@ -54,3 +67,6 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
54 67
55# just filtering instance of ARP tables for now 68# just filtering instance of ARP tables for now
56obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o 69obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
70
71obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
72
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3ea4127404d..fd7a3f68917 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -221,8 +221,9 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
221static unsigned int 221static unsigned int
222arpt_error(struct sk_buff *skb, const struct xt_action_param *par) 222arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
223{ 223{
224 net_err_ratelimited("arp_tables: error: '%s'\n", 224 if (net_ratelimit())
225 (const char *)par->targinfo); 225 pr_err("arp_tables: error: '%s'\n",
226 (const char *)par->targinfo);
226 227
227 return NF_DROP; 228 return NF_DROP;
228} 229}
@@ -302,7 +303,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
302 if (v < 0) { 303 if (v < 0) {
303 /* Pop from stack? */ 304 /* Pop from stack? */
304 if (v != XT_RETURN) { 305 if (v != XT_RETURN) {
305 verdict = (unsigned int)(-v) - 1; 306 verdict = (unsigned)(-v) - 1;
306 break; 307 break;
307 } 308 }
308 e = back; 309 e = back;
@@ -1533,7 +1534,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
1533{ 1534{
1534 int ret; 1535 int ret;
1535 1536
1536 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1537 if (!capable(CAP_NET_ADMIN))
1537 return -EPERM; 1538 return -EPERM;
1538 1539
1539 switch (cmd) { 1540 switch (cmd) {
@@ -1677,7 +1678,7 @@ static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,
1677{ 1678{
1678 int ret; 1679 int ret;
1679 1680
1680 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1681 if (!capable(CAP_NET_ADMIN))
1681 return -EPERM; 1682 return -EPERM;
1682 1683
1683 switch (cmd) { 1684 switch (cmd) {
@@ -1698,7 +1699,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned
1698{ 1699{
1699 int ret; 1700 int ret;
1700 1701
1701 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1702 if (!capable(CAP_NET_ADMIN))
1702 return -EPERM; 1703 return -EPERM;
1703 1704
1704 switch (cmd) { 1705 switch (cmd) {
@@ -1722,7 +1723,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
1722{ 1723{
1723 int ret; 1724 int ret;
1724 1725
1725 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1726 if (!capable(CAP_NET_ADMIN))
1726 return -EPERM; 1727 return -EPERM;
1727 1728
1728 switch (cmd) { 1729 switch (cmd) {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 17c5e06da66..24e556e83a3 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -153,7 +153,8 @@ ip_checkentry(const struct ipt_ip *ip)
153static unsigned int 153static unsigned int
154ipt_error(struct sk_buff *skb, const struct xt_action_param *par) 154ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
155{ 155{
156 net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); 156 if (net_ratelimit())
157 pr_info("error: `%s'\n", (const char *)par->targinfo);
157 158
158 return NF_DROP; 159 return NF_DROP;
159} 160}
@@ -376,7 +377,7 @@ ipt_do_table(struct sk_buff *skb,
376 if (v < 0) { 377 if (v < 0) {
377 /* Pop from stack? */ 378 /* Pop from stack? */
378 if (v != XT_RETURN) { 379 if (v != XT_RETURN) {
379 verdict = (unsigned int)(-v) - 1; 380 verdict = (unsigned)(-v) - 1;
380 break; 381 break;
381 } 382 }
382 if (*stackptr <= origptr) { 383 if (*stackptr <= origptr) {
@@ -1846,7 +1847,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
1846{ 1847{
1847 int ret; 1848 int ret;
1848 1849
1849 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1850 if (!capable(CAP_NET_ADMIN))
1850 return -EPERM; 1851 return -EPERM;
1851 1852
1852 switch (cmd) { 1853 switch (cmd) {
@@ -1961,7 +1962,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
1961{ 1962{
1962 int ret; 1963 int ret;
1963 1964
1964 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1965 if (!capable(CAP_NET_ADMIN))
1965 return -EPERM; 1966 return -EPERM;
1966 1967
1967 switch (cmd) { 1968 switch (cmd) {
@@ -1983,7 +1984,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1983{ 1984{
1984 int ret; 1985 int ret;
1985 1986
1986 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1987 if (!capable(CAP_NET_ADMIN))
1987 return -EPERM; 1988 return -EPERM;
1988 1989
1989 switch (cmd) { 1990 switch (cmd) {
@@ -2008,7 +2009,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2008{ 2009{
2009 int ret; 2010 int ret;
2010 2011
2011 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2012 if (!capable(CAP_NET_ADMIN))
2012 return -EPERM; 2013 return -EPERM;
2013 2014
2014 switch (cmd) { 2015 switch (cmd) {
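The logging change in this hunk (and the matching one in arp_tables.c above) swaps net_info_ratelimited() back to an open-coded net_ratelimit() check. A hedged kernel-style fragment showing that the two forms are equivalent; it is a sketch for comparison, not a standalone program, and the function name is illustrative:

#include <linux/net.h>          /* net_ratelimit(), net_info_ratelimited() */
#include <linux/printk.h>

static void report_bad_target(const char *name)
{
        /* newer helper: rate limiting and printk in one call */
        net_info_ratelimited("error: `%s'\n", name);

        /* older open-coded equivalent, as used after this revert */
        if (net_ratelimit())
                pr_info("error: `%s'\n", name);
}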
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 75e33a7048f..db8d22db425 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -246,7 +246,8 @@ clusterip_hashfn(const struct sk_buff *skb,
246 dport = ports[1]; 246 dport = ports[1];
247 } 247 }
248 } else { 248 } else {
249 net_info_ratelimited("unknown protocol %u\n", iph->protocol); 249 if (net_ratelimit())
250 pr_info("unknown protocol %u\n", iph->protocol);
250 } 251 }
251 252
252 switch (config->hash_mode) { 253 switch (config->hash_mode) {
@@ -394,6 +395,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
394 config = clusterip_config_init(cipinfo, 395 config = clusterip_config_init(cipinfo,
395 e->ip.dst.s_addr, dev); 396 e->ip.dst.s_addr, dev);
396 if (!config) { 397 if (!config) {
398 pr_info("cannot allocate config\n");
397 dev_put(dev); 399 dev_put(dev);
398 return -ENOMEM; 400 return -ENOMEM;
399 } 401 }
@@ -661,7 +663,6 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
661#define PROC_WRITELEN 10 663#define PROC_WRITELEN 10
662 char buffer[PROC_WRITELEN+1]; 664 char buffer[PROC_WRITELEN+1];
663 unsigned long nodenum; 665 unsigned long nodenum;
664 int rc;
665 666
666 if (size > PROC_WRITELEN) 667 if (size > PROC_WRITELEN)
667 return -EIO; 668 return -EIO;
@@ -670,15 +671,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
670 buffer[size] = 0; 671 buffer[size] = 0;
671 672
672 if (*buffer == '+') { 673 if (*buffer == '+') {
673 rc = kstrtoul(buffer+1, 10, &nodenum); 674 nodenum = simple_strtoul(buffer+1, NULL, 10);
674 if (rc)
675 return rc;
676 if (clusterip_add_node(c, nodenum)) 675 if (clusterip_add_node(c, nodenum))
677 return -ENOMEM; 676 return -ENOMEM;
678 } else if (*buffer == '-') { 677 } else if (*buffer == '-') {
679 rc = kstrtoul(buffer+1, 10, &nodenum); 678 nodenum = simple_strtoul(buffer+1, NULL,10);
680 if (rc)
681 return rc;
682 if (clusterip_del_node(c, nodenum)) 679 if (clusterip_del_node(c, nodenum))
683 return -ENOENT; 680 return -ENOENT;
684 } else 681 } else
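The proc_write change above drops kstrtoul() in favour of the older simple_strtoul(). For reference, a minimal kernel-style sketch of the checked-parse pattern the upstream side uses (the helper name is illustrative):

#include <linux/kernel.h>       /* kstrtoul() */

static int parse_node_number(const char *buf, unsigned long *nodenum)
{
        int rc;

        /* kstrtoul() rejects overflow and trailing garbage, which
         * simple_strtoul() silently ignores, and returns an errno the
         * caller can propagate back to userspace. */
        rc = kstrtoul(buf, 10, nodenum);
        if (rc)
                return rc;      /* -EINVAL or -ERANGE */

        return 0;
}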
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 5d5d4d1be9c..9931152a78b 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -19,9 +19,9 @@
19#include <net/ip.h> 19#include <net/ip.h>
20#include <net/checksum.h> 20#include <net/checksum.h>
21#include <net/route.h> 21#include <net/route.h>
22#include <net/netfilter/nf_nat_rule.h>
22#include <linux/netfilter_ipv4.h> 23#include <linux/netfilter_ipv4.h>
23#include <linux/netfilter/x_tables.h> 24#include <linux/netfilter/x_tables.h>
24#include <net/netfilter/nf_nat.h>
25 25
26MODULE_LICENSE("GPL"); 26MODULE_LICENSE("GPL");
27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); 27MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -30,9 +30,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
30/* FIXME: Multiple targets. --RR */ 30/* FIXME: Multiple targets. --RR */
31static int masquerade_tg_check(const struct xt_tgchk_param *par) 31static int masquerade_tg_check(const struct xt_tgchk_param *par)
32{ 32{
33 const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; 33 const struct nf_nat_multi_range_compat *mr = par->targinfo;
34 34
35 if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { 35 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
36 pr_debug("bad MAP_IPS.\n"); 36 pr_debug("bad MAP_IPS.\n");
37 return -EINVAL; 37 return -EINVAL;
38 } 38 }
@@ -50,9 +50,9 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
50 struct nf_conn_nat *nat; 50 struct nf_conn_nat *nat;
51 enum ip_conntrack_info ctinfo; 51 enum ip_conntrack_info ctinfo;
52 struct nf_nat_range newrange; 52 struct nf_nat_range newrange;
53 const struct nf_nat_ipv4_multi_range_compat *mr; 53 const struct nf_nat_multi_range_compat *mr;
54 const struct rtable *rt; 54 const struct rtable *rt;
55 __be32 newsrc, nh; 55 __be32 newsrc;
56 56
57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); 57 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
58 58
@@ -70,8 +70,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
70 70
71 mr = par->targinfo; 71 mr = par->targinfo;
72 rt = skb_rtable(skb); 72 rt = skb_rtable(skb);
73 nh = rt_nexthop(rt, ip_hdr(skb)->daddr); 73 newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
74 newsrc = inet_select_addr(par->out, nh, RT_SCOPE_UNIVERSE);
75 if (!newsrc) { 74 if (!newsrc) {
76 pr_info("%s ate my IP address\n", par->out->name); 75 pr_info("%s ate my IP address\n", par->out->name);
77 return NF_DROP; 76 return NF_DROP;
@@ -80,16 +79,13 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
80 nat->masq_index = par->out->ifindex; 79 nat->masq_index = par->out->ifindex;
81 80
82 /* Transfer from original range. */ 81 /* Transfer from original range. */
83 memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); 82 newrange = ((struct nf_nat_range)
84 memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); 83 { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
85 newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; 84 newsrc, newsrc,
86 newrange.min_addr.ip = newsrc; 85 mr->range[0].min, mr->range[0].max });
87 newrange.max_addr.ip = newsrc;
88 newrange.min_proto = mr->range[0].min;
89 newrange.max_proto = mr->range[0].max;
90 86
91 /* Hand modified range to generic setup. */ 87 /* Hand modified range to generic setup. */
92 return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); 88 return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC);
93} 89}
94 90
95static int 91static int
@@ -99,8 +95,7 @@ device_cmp(struct nf_conn *i, void *ifindex)
99 95
100 if (!nat) 96 if (!nat)
101 return 0; 97 return 0;
102 if (nf_ct_l3num(i) != NFPROTO_IPV4) 98
103 return 0;
104 return nat->masq_index == (int)(long)ifindex; 99 return nat->masq_index == (int)(long)ifindex;
105} 100}
106 101
@@ -144,7 +139,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
144 .name = "MASQUERADE", 139 .name = "MASQUERADE",
145 .family = NFPROTO_IPV4, 140 .family = NFPROTO_IPV4,
146 .target = masquerade_tg, 141 .target = masquerade_tg,
147 .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), 142 .targetsize = sizeof(struct nf_nat_multi_range_compat),
148 .table = "nat", 143 .table = "nat",
149 .hooks = 1 << NF_INET_POST_ROUTING, 144 .hooks = 1 << NF_INET_POST_ROUTING,
150 .checkentry = masquerade_tg_check, 145 .checkentry = masquerade_tg_check,
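For reference, the upstream (left-hand) side of this hunk resolves the route's next hop with rt_nexthop() instead of reading rt->rt_gateway, and fills the newer nf_nat_range structure field by field. A condensed, hedged sketch of that flow; the function name is illustrative and error handling is trimmed to the essentials:

#include <net/ip.h>
#include <net/route.h>
#include <net/netfilter/nf_nat.h>

static unsigned int masquerade_pick_source(struct sk_buff *skb,
                                           const struct net_device *out,
                                           struct nf_conn *ct,
                                           const struct nf_nat_ipv4_multi_range_compat *mr)
{
        const struct rtable *rt = skb_rtable(skb);
        __be32 nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
        __be32 newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
        struct nf_nat_range newrange;

        if (!newsrc)
                return NF_DROP;         /* no usable address on this device */

        memset(&newrange, 0, sizeof(newrange));
        newrange.flags       = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
        newrange.min_addr.ip = newsrc;
        newrange.max_addr.ip = newsrc;
        newrange.min_proto   = mr->range[0].min;
        newrange.max_proto   = mr->range[0].max;

        return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}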
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 04b18c1ac34..9dd754c7f2b 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -81,7 +81,6 @@ static void send_reset(struct sk_buff *oldskb, int hook)
81 niph->saddr = oiph->daddr; 81 niph->saddr = oiph->daddr;
82 niph->daddr = oiph->saddr; 82 niph->daddr = oiph->saddr;
83 83
84 skb_reset_transport_header(nskb);
85 tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); 84 tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
86 memset(tcph, 0, sizeof(*tcph)); 85 memset(tcph, 0, sizeof(*tcph));
87 tcph->source = oth->dest; 86 tcph->source = oth->dest;
@@ -129,6 +128,14 @@ static void send_reset(struct sk_buff *oldskb, int hook)
129static inline void send_unreach(struct sk_buff *skb_in, int code) 128static inline void send_unreach(struct sk_buff *skb_in, int code)
130{ 129{
131 icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); 130 icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
131#ifdef CONFIG_IP_NF_TARGET_REJECT_SKERR
132 if (skb_in->sk) {
133 skb_in->sk->sk_err = icmp_err_convert[code].errno;
134 skb_in->sk->sk_error_report(skb_in->sk);
135 pr_debug("ipt_REJECT: sk_err=%d for skb=%p sk=%p\n",
136 skb_in->sk->sk_err, skb_in, skb_in->sk);
137 }
138#endif
132} 139}
133 140
134static unsigned int 141static unsigned int
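The SKERR branch added above converts the ICMP code to an errno with icmp_err_convert[] and wakes the local sender via sk_error_report(). A hedged sketch of that conversion; with it, a locally generated connection rejected with icmp-port-unreachable surfaces as ECONNREFUSED rather than hanging until a timeout:

#include <linux/icmp.h>
#include <net/icmp.h>           /* icmp_send(), icmp_err_convert[] */

static void send_unreach_with_skerr(struct sk_buff *skb_in, int code)
{
        icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);

        /* Propagate the reject to the local socket, if there is one. */
        if (skb_in->sk) {
                skb_in->sk->sk_err = icmp_err_convert[code].errno;
                skb_in->sk->sk_error_report(skb_in->sk);
        }
}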
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index b5ef3cba225..446e0f467a1 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -65,7 +65,7 @@ static unsigned int flushtimeout = 10;
65module_param(flushtimeout, uint, 0600); 65module_param(flushtimeout, uint, 0600);
66MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); 66MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
67 67
68static bool nflog = true; 68static int nflog = 1;
69module_param(nflog, bool, 0400); 69module_param(nflog, bool, 0400);
70MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); 70MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
71 71
@@ -135,8 +135,10 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
135 * due to slab allocator restrictions */ 135 * due to slab allocator restrictions */
136 136
137 n = max(size, nlbufsiz); 137 n = max(size, nlbufsiz);
138 skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN); 138 skb = alloc_skb(n, GFP_ATOMIC);
139 if (!skb) { 139 if (!skb) {
140 pr_debug("cannot alloc whole buffer %ub!\n", n);
141
140 if (n > size) { 142 if (n > size) {
141 /* try to allocate only as much as we need for 143 /* try to allocate only as much as we need for
142 * current packet */ 144 * current packet */
@@ -196,15 +198,12 @@ static void ipt_ulog_packet(unsigned int hooknum,
196 198
197 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold); 199 pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
198 200
199 nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, 201 /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
200 sizeof(*pm)+copy_len, 0); 202 nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
201 if (!nlh) { 203 sizeof(*pm)+copy_len);
202 pr_debug("error during nlmsg_put\n");
203 goto out_unlock;
204 }
205 ub->qlen++; 204 ub->qlen++;
206 205
207 pm = nlmsg_data(nlh); 206 pm = NLMSG_DATA(nlh);
208 207
209 /* We might not have a timestamp, get one */ 208 /* We might not have a timestamp, get one */
210 if (skb->tstamp.tv64 == 0) 209 if (skb->tstamp.tv64 == 0)
@@ -264,11 +263,13 @@ static void ipt_ulog_packet(unsigned int hooknum,
264 nlh->nlmsg_type = NLMSG_DONE; 263 nlh->nlmsg_type = NLMSG_DONE;
265 ulog_send(groupnum); 264 ulog_send(groupnum);
266 } 265 }
267out_unlock: 266
268 spin_unlock_bh(&ulog_lock); 267 spin_unlock_bh(&ulog_lock);
269 268
270 return; 269 return;
271 270
271nlmsg_failure:
272 pr_debug("error during NLMSG_PUT\n");
272alloc_failure: 273alloc_failure:
273 pr_debug("Error building netlink message\n"); 274 pr_debug("Error building netlink message\n");
274 spin_unlock_bh(&ulog_lock); 275 spin_unlock_bh(&ulog_lock);
@@ -381,9 +382,6 @@ static struct nf_logger ipt_ulog_logger __read_mostly = {
381static int __init ulog_tg_init(void) 382static int __init ulog_tg_init(void)
382{ 383{
383 int ret, i; 384 int ret, i;
384 struct netlink_kernel_cfg cfg = {
385 .groups = ULOG_MAXNLGROUPS,
386 };
387 385
388 pr_debug("init module\n"); 386 pr_debug("init module\n");
389 387
@@ -396,7 +394,9 @@ static int __init ulog_tg_init(void)
396 for (i = 0; i < ULOG_MAXNLGROUPS; i++) 394 for (i = 0; i < ULOG_MAXNLGROUPS; i++)
397 setup_timer(&ulog_buffers[i].timer, ulog_timer, i); 395 setup_timer(&ulog_buffers[i].timer, ulog_timer, i);
398 396
399 nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, &cfg); 397 nflognl = netlink_kernel_create(&init_net,
398 NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL,
399 NULL, THIS_MODULE);
400 if (!nflognl) 400 if (!nflognl)
401 return -ENOMEM; 401 return -ENOMEM;
402 402
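The netlink change in this hunk trades NLMSG_PUT(), whose failure path is the hidden goto nlmsg_failure called out in the comment, for nlmsg_put(), which simply returns NULL. A minimal hedged sketch of the explicit style (the helper name and error code are illustrative):

#include <linux/errno.h>
#include <net/netlink.h>
#include <linux/netfilter_ipv4/ipt_ULOG.h>      /* ULOG_NL_EVENT */

static int append_ulog_message(struct sk_buff *nlskb, unsigned int seq,
                               size_t payload_len)
{
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(nlskb, 0, seq, ULOG_NL_EVENT, payload_len, 0);
        if (!nlh)
                return -ENOBUFS;        /* no hidden goto: handled in line */

        /* fill the payload via nlmsg_data(nlh) ... */
        return 0;
}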
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
deleted file mode 100644
index c30130062cd..00000000000
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ /dev/null
@@ -1,141 +0,0 @@
1/*
2 * Copyright (c) 2011 Florian Westphal <fw@strlen.de>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 */
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11#include <linux/module.h>
12#include <linux/skbuff.h>
13#include <linux/netdevice.h>
14#include <linux/ip.h>
15#include <net/ip.h>
16#include <net/ip_fib.h>
17#include <net/route.h>
18
19#include <linux/netfilter/xt_rpfilter.h>
20#include <linux/netfilter/x_tables.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
24MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match");
25
26/* don't try to find route from mcast/bcast/zeronet */
27static __be32 rpfilter_get_saddr(__be32 addr)
28{
29 if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
30 ipv4_is_zeronet(addr))
31 return 0;
32 return addr;
33}
34
35static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
36 const struct net_device *dev, u8 flags)
37{
38 struct fib_result res;
39 bool dev_match;
40 struct net *net = dev_net(dev);
41 int ret __maybe_unused;
42
43 if (fib_lookup(net, fl4, &res))
44 return false;
45
46 if (res.type != RTN_UNICAST) {
47 if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
48 return false;
49 }
50 dev_match = false;
51#ifdef CONFIG_IP_ROUTE_MULTIPATH
52 for (ret = 0; ret < res.fi->fib_nhs; ret++) {
53 struct fib_nh *nh = &res.fi->fib_nh[ret];
54
55 if (nh->nh_dev == dev) {
56 dev_match = true;
57 break;
58 }
59 }
60#else
61 if (FIB_RES_DEV(res) == dev)
62 dev_match = true;
63#endif
64 if (dev_match || flags & XT_RPFILTER_LOOSE)
65 return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST;
66 return dev_match;
67}
68
69static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
70{
71 const struct xt_rpfilter_info *info;
72 const struct iphdr *iph;
73 struct flowi4 flow;
74 bool invert;
75
76 info = par->matchinfo;
77 invert = info->flags & XT_RPFILTER_INVERT;
78
79 if (par->in->flags & IFF_LOOPBACK)
80 return true ^ invert;
81
82 iph = ip_hdr(skb);
83 if (ipv4_is_multicast(iph->daddr)) {
84 if (ipv4_is_zeronet(iph->saddr))
85 return ipv4_is_local_multicast(iph->daddr) ^ invert;
86 flow.flowi4_iif = 0;
87 } else {
88 flow.flowi4_iif = LOOPBACK_IFINDEX;
89 }
90
91 flow.daddr = iph->saddr;
92 flow.saddr = rpfilter_get_saddr(iph->daddr);
93 flow.flowi4_oif = 0;
94 flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
95 flow.flowi4_tos = RT_TOS(iph->tos);
96 flow.flowi4_scope = RT_SCOPE_UNIVERSE;
97
98 return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert;
99}
100
101static int rpfilter_check(const struct xt_mtchk_param *par)
102{
103 const struct xt_rpfilter_info *info = par->matchinfo;
104 unsigned int options = ~XT_RPFILTER_OPTION_MASK;
105 if (info->flags & options) {
106 pr_info("unknown options encountered");
107 return -EINVAL;
108 }
109
110 if (strcmp(par->table, "mangle") != 0 &&
111 strcmp(par->table, "raw") != 0) {
112 pr_info("match only valid in the \'raw\' "
113 "or \'mangle\' tables, not \'%s\'.\n", par->table);
114 return -EINVAL;
115 }
116
117 return 0;
118}
119
120static struct xt_match rpfilter_mt_reg __read_mostly = {
121 .name = "rpfilter",
122 .family = NFPROTO_IPV4,
123 .checkentry = rpfilter_check,
124 .match = rpfilter_mt,
125 .matchsize = sizeof(struct xt_rpfilter_info),
126 .hooks = (1 << NF_INET_PRE_ROUTING),
127 .me = THIS_MODULE
128};
129
130static int __init rpfilter_mt_init(void)
131{
132 return xt_register_match(&rpfilter_mt_reg);
133}
134
135static void __exit rpfilter_mt_exit(void)
136{
137 xt_unregister_match(&rpfilter_mt_reg);
138}
139
140module_init(rpfilter_mt_init);
141module_exit(rpfilter_mt_exit);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 6b3da5cf54e..c37641e819f 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -52,7 +52,7 @@ iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
52static struct nf_hook_ops *filter_ops __read_mostly; 52static struct nf_hook_ops *filter_ops __read_mostly;
53 53
54/* Default to forward because I got too much mail already. */ 54/* Default to forward because I got too much mail already. */
55static bool forward = true; 55static int forward = NF_ACCEPT;
56module_param(forward, bool, 0000); 56module_param(forward, bool, 0000);
57 57
58static int __net_init iptable_filter_net_init(struct net *net) 58static int __net_init iptable_filter_net_init(struct net *net)
@@ -64,12 +64,14 @@ static int __net_init iptable_filter_net_init(struct net *net)
64 return -ENOMEM; 64 return -ENOMEM;
65 /* Entry 1 is the FORWARD hook */ 65 /* Entry 1 is the FORWARD hook */
66 ((struct ipt_standard *)repl->entries)[1].target.verdict = 66 ((struct ipt_standard *)repl->entries)[1].target.verdict =
67 forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; 67 -forward - 1;
68 68
69 net->ipv4.iptable_filter = 69 net->ipv4.iptable_filter =
70 ipt_register_table(net, &packet_filter, repl); 70 ipt_register_table(net, &packet_filter, repl);
71 kfree(repl); 71 kfree(repl);
72 return PTR_RET(net->ipv4.iptable_filter); 72 if (IS_ERR(net->ipv4.iptable_filter))
73 return PTR_ERR(net->ipv4.iptable_filter);
74 return 0;
73} 75}
74 76
75static void __net_exit iptable_filter_net_exit(struct net *net) 77static void __net_exit iptable_filter_net_exit(struct net *net)
@@ -86,6 +88,11 @@ static int __init iptable_filter_init(void)
86{ 88{
87 int ret; 89 int ret;
88 90
91 if (forward < 0 || forward > NF_MAX_VERDICT) {
92 pr_err("iptables forward must be 0 or 1\n");
93 return -EINVAL;
94 }
95
89 ret = register_pernet_subsys(&iptable_filter_net_ops); 96 ret = register_pernet_subsys(&iptable_filter_net_ops);
90 if (ret < 0) 97 if (ret < 0)
91 return ret; 98 return ret;
@@ -94,10 +101,14 @@ static int __init iptable_filter_init(void)
94 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); 101 filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
95 if (IS_ERR(filter_ops)) { 102 if (IS_ERR(filter_ops)) {
96 ret = PTR_ERR(filter_ops); 103 ret = PTR_ERR(filter_ops);
97 unregister_pernet_subsys(&iptable_filter_net_ops); 104 goto cleanup_table;
98 } 105 }
99 106
100 return ret; 107 return ret;
108
109 cleanup_table:
110 unregister_pernet_subsys(&iptable_filter_net_ops);
111 return ret;
101} 112}
102 113
103static void __exit iptable_filter_fini(void) 114static void __exit iptable_filter_fini(void)
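The PTR_RET() dropped in this hunk is shorthand for exactly the IS_ERR()/PTR_ERR() sequence that replaces it (upstream later renamed the macro PTR_ERR_OR_ZERO()). A small sketch of the equivalence, using a generic pointer so it stays self-contained:

#include <linux/err.h>

static int register_status_open_coded(void *table)
{
        if (IS_ERR(table))
                return PTR_ERR(table);
        return 0;
}

static int register_status_macro(void *table)
{
        return PTR_RET(table);  /* same result as the open-coded form */
}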
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 85d88f20644..aef5d1fbe77 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -104,7 +104,9 @@ static int __net_init iptable_mangle_net_init(struct net *net)
104 net->ipv4.iptable_mangle = 104 net->ipv4.iptable_mangle =
105 ipt_register_table(net, &packet_mangler, repl); 105 ipt_register_table(net, &packet_mangler, repl);
106 kfree(repl); 106 kfree(repl);
107 return PTR_RET(net->ipv4.iptable_mangle); 107 if (IS_ERR(net->ipv4.iptable_mangle))
108 return PTR_ERR(net->ipv4.iptable_mangle);
109 return 0;
108} 110}
109 111
110static void __net_exit iptable_mangle_net_exit(struct net *net) 112static void __net_exit iptable_mangle_net_exit(struct net *net)
@@ -129,10 +131,14 @@ static int __init iptable_mangle_init(void)
129 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); 131 mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
130 if (IS_ERR(mangle_ops)) { 132 if (IS_ERR(mangle_ops)) {
131 ret = PTR_ERR(mangle_ops); 133 ret = PTR_ERR(mangle_ops);
132 unregister_pernet_subsys(&iptable_mangle_net_ops); 134 goto cleanup_table;
133 } 135 }
134 136
135 return ret; 137 return ret;
138
139 cleanup_table:
140 unregister_pernet_subsys(&iptable_mangle_net_ops);
141 return ret;
136} 142}
137 143
138static void __exit iptable_mangle_fini(void) 144static void __exit iptable_mangle_fini(void)
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
deleted file mode 100644
index eeaff7e4acb..00000000000
--- a/net/ipv4/netfilter/iptable_nat.c
+++ /dev/null
@@ -1,329 +0,0 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
3 * (C) 2011 Patrick McHardy <kaber@trash.net>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 */
9
10#include <linux/module.h>
11#include <linux/netfilter.h>
12#include <linux/netfilter_ipv4.h>
13#include <linux/netfilter_ipv4/ip_tables.h>
14#include <linux/ip.h>
15#include <net/ip.h>
16
17#include <net/netfilter/nf_nat.h>
18#include <net/netfilter/nf_nat_core.h>
19#include <net/netfilter/nf_nat_l3proto.h>
20
21static const struct xt_table nf_nat_ipv4_table = {
22 .name = "nat",
23 .valid_hooks = (1 << NF_INET_PRE_ROUTING) |
24 (1 << NF_INET_POST_ROUTING) |
25 (1 << NF_INET_LOCAL_OUT) |
26 (1 << NF_INET_LOCAL_IN),
27 .me = THIS_MODULE,
28 .af = NFPROTO_IPV4,
29};
30
31static unsigned int alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
32{
33 /* Force range to this IP; let proto decide mapping for
34 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
35 */
36 struct nf_nat_range range;
37
38 range.flags = 0;
39 pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
40 HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
41 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
42 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
43
44 return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
45}
46
47static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
48 const struct net_device *in,
49 const struct net_device *out,
50 struct nf_conn *ct)
51{
52 struct net *net = nf_ct_net(ct);
53 unsigned int ret;
54
55 ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
56 if (ret == NF_ACCEPT) {
57 if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
58 ret = alloc_null_binding(ct, hooknum);
59 }
60 return ret;
61}
62
63static unsigned int
64nf_nat_ipv4_fn(unsigned int hooknum,
65 struct sk_buff *skb,
66 const struct net_device *in,
67 const struct net_device *out,
68 int (*okfn)(struct sk_buff *))
69{
70 struct nf_conn *ct;
71 enum ip_conntrack_info ctinfo;
72 struct nf_conn_nat *nat;
73 /* maniptype == SRC for postrouting. */
74 enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
75
76 /* We never see fragments: conntrack defrags on pre-routing
77 * and local-out, and nf_nat_out protects post-routing.
78 */
79 NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));
80
81 ct = nf_ct_get(skb, &ctinfo);
82 /* Can't track? It's not due to stress, or conntrack would
 83 * have dropped it. Hence it's the user's responsibility to
84 * packet filter it out, or implement conntrack/NAT for that
85 * protocol. 8) --RR
86 */
87 if (!ct)
88 return NF_ACCEPT;
89
90 /* Don't try to NAT if this packet is not conntracked */
91 if (nf_ct_is_untracked(ct))
92 return NF_ACCEPT;
93
94 nat = nfct_nat(ct);
95 if (!nat) {
96 /* NAT module was loaded late. */
97 if (nf_ct_is_confirmed(ct))
98 return NF_ACCEPT;
99 nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
100 if (nat == NULL) {
101 pr_debug("failed to add NAT extension\n");
102 return NF_ACCEPT;
103 }
104 }
105
106 switch (ctinfo) {
107 case IP_CT_RELATED:
108 case IP_CT_RELATED_REPLY:
109 if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
110 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
111 hooknum))
112 return NF_DROP;
113 else
114 return NF_ACCEPT;
115 }
116 /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
117 case IP_CT_NEW:
118 /* Seen it before? This can happen for loopback, retrans,
119 * or local packets.
120 */
121 if (!nf_nat_initialized(ct, maniptype)) {
122 unsigned int ret;
123
124 ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
125 if (ret != NF_ACCEPT)
126 return ret;
127 } else {
128 pr_debug("Already setup manip %s for ct %p\n",
129 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
130 ct);
131 if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
132 goto oif_changed;
133 }
134 break;
135
136 default:
137 /* ESTABLISHED */
138 NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
139 ctinfo == IP_CT_ESTABLISHED_REPLY);
140 if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
141 goto oif_changed;
142 }
143
144 return nf_nat_packet(ct, ctinfo, hooknum, skb);
145
146oif_changed:
147 nf_ct_kill_acct(ct, ctinfo, skb);
148 return NF_DROP;
149}
150
151static unsigned int
152nf_nat_ipv4_in(unsigned int hooknum,
153 struct sk_buff *skb,
154 const struct net_device *in,
155 const struct net_device *out,
156 int (*okfn)(struct sk_buff *))
157{
158 unsigned int ret;
159 __be32 daddr = ip_hdr(skb)->daddr;
160
161 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
162 if (ret != NF_DROP && ret != NF_STOLEN &&
163 daddr != ip_hdr(skb)->daddr)
164 skb_dst_drop(skb);
165
166 return ret;
167}
168
169static unsigned int
170nf_nat_ipv4_out(unsigned int hooknum,
171 struct sk_buff *skb,
172 const struct net_device *in,
173 const struct net_device *out,
174 int (*okfn)(struct sk_buff *))
175{
176#ifdef CONFIG_XFRM
177 const struct nf_conn *ct;
178 enum ip_conntrack_info ctinfo;
179#endif
180 unsigned int ret;
181
182 /* root is playing with raw sockets. */
183 if (skb->len < sizeof(struct iphdr) ||
184 ip_hdrlen(skb) < sizeof(struct iphdr))
185 return NF_ACCEPT;
186
187 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
188#ifdef CONFIG_XFRM
189 if (ret != NF_DROP && ret != NF_STOLEN &&
190 !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
191 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
192 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
193
194 if ((ct->tuplehash[dir].tuple.src.u3.ip !=
195 ct->tuplehash[!dir].tuple.dst.u3.ip) ||
196 (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
197 ct->tuplehash[dir].tuple.src.u.all !=
198 ct->tuplehash[!dir].tuple.dst.u.all))
199 if (nf_xfrm_me_harder(skb, AF_INET) < 0)
200 ret = NF_DROP;
201 }
202#endif
203 return ret;
204}
205
206static unsigned int
207nf_nat_ipv4_local_fn(unsigned int hooknum,
208 struct sk_buff *skb,
209 const struct net_device *in,
210 const struct net_device *out,
211 int (*okfn)(struct sk_buff *))
212{
213 const struct nf_conn *ct;
214 enum ip_conntrack_info ctinfo;
215 unsigned int ret;
216
217 /* root is playing with raw sockets. */
218 if (skb->len < sizeof(struct iphdr) ||
219 ip_hdrlen(skb) < sizeof(struct iphdr))
220 return NF_ACCEPT;
221
222 ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
223 if (ret != NF_DROP && ret != NF_STOLEN &&
224 (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
225 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
226
227 if (ct->tuplehash[dir].tuple.dst.u3.ip !=
228 ct->tuplehash[!dir].tuple.src.u3.ip) {
229 if (ip_route_me_harder(skb, RTN_UNSPEC))
230 ret = NF_DROP;
231 }
232#ifdef CONFIG_XFRM
233 else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
234 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
235 ct->tuplehash[dir].tuple.dst.u.all !=
236 ct->tuplehash[!dir].tuple.src.u.all)
237 if (nf_xfrm_me_harder(skb, AF_INET) < 0)
238 ret = NF_DROP;
239#endif
240 }
241 return ret;
242}
243
244static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
245 /* Before packet filtering, change destination */
246 {
247 .hook = nf_nat_ipv4_in,
248 .owner = THIS_MODULE,
249 .pf = NFPROTO_IPV4,
250 .hooknum = NF_INET_PRE_ROUTING,
251 .priority = NF_IP_PRI_NAT_DST,
252 },
253 /* After packet filtering, change source */
254 {
255 .hook = nf_nat_ipv4_out,
256 .owner = THIS_MODULE,
257 .pf = NFPROTO_IPV4,
258 .hooknum = NF_INET_POST_ROUTING,
259 .priority = NF_IP_PRI_NAT_SRC,
260 },
261 /* Before packet filtering, change destination */
262 {
263 .hook = nf_nat_ipv4_local_fn,
264 .owner = THIS_MODULE,
265 .pf = NFPROTO_IPV4,
266 .hooknum = NF_INET_LOCAL_OUT,
267 .priority = NF_IP_PRI_NAT_DST,
268 },
269 /* After packet filtering, change source */
270 {
271 .hook = nf_nat_ipv4_fn,
272 .owner = THIS_MODULE,
273 .pf = NFPROTO_IPV4,
274 .hooknum = NF_INET_LOCAL_IN,
275 .priority = NF_IP_PRI_NAT_SRC,
276 },
277};
278
279static int __net_init iptable_nat_net_init(struct net *net)
280{
281 struct ipt_replace *repl;
282
283 repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
284 if (repl == NULL)
285 return -ENOMEM;
286 net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
287 kfree(repl);
288 return PTR_RET(net->ipv4.nat_table);
289}
290
291static void __net_exit iptable_nat_net_exit(struct net *net)
292{
293 ipt_unregister_table(net, net->ipv4.nat_table);
294}
295
296static struct pernet_operations iptable_nat_net_ops = {
297 .init = iptable_nat_net_init,
298 .exit = iptable_nat_net_exit,
299};
300
301static int __init iptable_nat_init(void)
302{
303 int err;
304
305 err = register_pernet_subsys(&iptable_nat_net_ops);
306 if (err < 0)
307 goto err1;
308
309 err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
310 if (err < 0)
311 goto err2;
312 return 0;
313
314err2:
315 unregister_pernet_subsys(&iptable_nat_net_ops);
316err1:
317 return err;
318}
319
320static void __exit iptable_nat_exit(void)
321{
322 nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
323 unregister_pernet_subsys(&iptable_nat_net_ops);
324}
325
326module_init(iptable_nat_init);
327module_exit(iptable_nat_exit);
328
329MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 03d9696d3c6..07fb710cd72 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -48,7 +48,9 @@ static int __net_init iptable_raw_net_init(struct net *net)
48 net->ipv4.iptable_raw = 48 net->ipv4.iptable_raw =
49 ipt_register_table(net, &packet_raw, repl); 49 ipt_register_table(net, &packet_raw, repl);
50 kfree(repl); 50 kfree(repl);
51 return PTR_RET(net->ipv4.iptable_raw); 51 if (IS_ERR(net->ipv4.iptable_raw))
52 return PTR_ERR(net->ipv4.iptable_raw);
53 return 0;
52} 54}
53 55
54static void __net_exit iptable_raw_net_exit(struct net *net) 56static void __net_exit iptable_raw_net_exit(struct net *net)
@@ -73,10 +75,14 @@ static int __init iptable_raw_init(void)
73 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); 75 rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
74 if (IS_ERR(rawtable_ops)) { 76 if (IS_ERR(rawtable_ops)) {
75 ret = PTR_ERR(rawtable_ops); 77 ret = PTR_ERR(rawtable_ops);
76 unregister_pernet_subsys(&iptable_raw_net_ops); 78 goto cleanup_table;
77 } 79 }
78 80
79 return ret; 81 return ret;
82
83 cleanup_table:
84 unregister_pernet_subsys(&iptable_raw_net_ops);
85 return ret;
80} 86}
81 87
82static void __exit iptable_raw_fini(void) 88static void __exit iptable_raw_fini(void)
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index b283d8e2601..be45bdc4c60 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -66,7 +66,10 @@ static int __net_init iptable_security_net_init(struct net *net)
66 net->ipv4.iptable_security = 66 net->ipv4.iptable_security =
67 ipt_register_table(net, &security_table, repl); 67 ipt_register_table(net, &security_table, repl);
68 kfree(repl); 68 kfree(repl);
69 return PTR_RET(net->ipv4.iptable_security); 69 if (IS_ERR(net->ipv4.iptable_security))
70 return PTR_ERR(net->ipv4.iptable_security);
71
72 return 0;
70} 73}
71 74
72static void __net_exit iptable_security_net_exit(struct net *net) 75static void __net_exit iptable_security_net_exit(struct net *net)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index fcdd0c2406e..de9da21113a 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -29,6 +29,11 @@
29#include <net/netfilter/ipv4/nf_defrag_ipv4.h> 29#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
30#include <net/netfilter/nf_log.h> 30#include <net/netfilter/nf_log.h>
31 31
32int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
33 struct nf_conn *ct,
34 enum ip_conntrack_info ctinfo);
35EXPORT_SYMBOL_GPL(nf_nat_seq_adjust_hook);
36
32static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, 37static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
33 struct nf_conntrack_tuple *tuple) 38 struct nf_conntrack_tuple *tuple)
34{ 39{
@@ -69,32 +74,24 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
69 74
70 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 75 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
71 if (iph == NULL) 76 if (iph == NULL)
72 return -NF_ACCEPT; 77 return -NF_DROP;
73 78
74 /* Conntrack defragments packets, we might still see fragments 79 /* Conntrack defragments packets, we might still see fragments
75 * inside ICMP packets though. */ 80 * inside ICMP packets though. */
76 if (iph->frag_off & htons(IP_OFFSET)) 81 if (iph->frag_off & htons(IP_OFFSET))
77 return -NF_ACCEPT; 82 return -NF_DROP;
78 83
79 *dataoff = nhoff + (iph->ihl << 2); 84 *dataoff = nhoff + (iph->ihl << 2);
80 *protonum = iph->protocol; 85 *protonum = iph->protocol;
81 86
82 /* Check bogus IP headers */
83 if (*dataoff > skb->len) {
84 pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
85 "nhoff %u, ihl %u, skblen %u\n",
86 nhoff, iph->ihl << 2, skb->len);
87 return -NF_ACCEPT;
88 }
89
90 return NF_ACCEPT; 87 return NF_ACCEPT;
91} 88}
92 89
93static unsigned int ipv4_helper(unsigned int hooknum, 90static unsigned int ipv4_confirm(unsigned int hooknum,
94 struct sk_buff *skb, 91 struct sk_buff *skb,
95 const struct net_device *in, 92 const struct net_device *in,
96 const struct net_device *out, 93 const struct net_device *out,
97 int (*okfn)(struct sk_buff *)) 94 int (*okfn)(struct sk_buff *))
98{ 95{
99 struct nf_conn *ct; 96 struct nf_conn *ct;
100 enum ip_conntrack_info ctinfo; 97 enum ip_conntrack_info ctinfo;
@@ -105,38 +102,24 @@ static unsigned int ipv4_helper(unsigned int hooknum,
105 /* This is where we call the helper: as the packet goes out. */ 102 /* This is where we call the helper: as the packet goes out. */
106 ct = nf_ct_get(skb, &ctinfo); 103 ct = nf_ct_get(skb, &ctinfo);
107 if (!ct || ctinfo == IP_CT_RELATED_REPLY) 104 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
108 return NF_ACCEPT; 105 goto out;
109 106
110 help = nfct_help(ct); 107 help = nfct_help(ct);
111 if (!help) 108 if (!help)
112 return NF_ACCEPT; 109 goto out;
113 110
114 /* rcu_read_lock()ed by nf_hook_slow */ 111 /* rcu_read_lock()ed by nf_hook_slow */
115 helper = rcu_dereference(help->helper); 112 helper = rcu_dereference(help->helper);
116 if (!helper) 113 if (!helper)
117 return NF_ACCEPT; 114 goto out;
118 115
119 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), 116 ret = helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
120 ct, ctinfo); 117 ct, ctinfo);
121 if (ret != NF_ACCEPT && (ret & NF_VERDICT_MASK) != NF_QUEUE) { 118 if (ret != NF_ACCEPT) {
122 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL, 119 nf_log_packet(NFPROTO_IPV4, hooknum, skb, in, out, NULL,
123 "nf_ct_%s: dropping packet", helper->name); 120 "nf_ct_%s: dropping packet", helper->name);
121 return ret;
124 } 122 }
125 return ret;
126}
127
128static unsigned int ipv4_confirm(unsigned int hooknum,
129 struct sk_buff *skb,
130 const struct net_device *in,
131 const struct net_device *out,
132 int (*okfn)(struct sk_buff *))
133{
134 struct nf_conn *ct;
135 enum ip_conntrack_info ctinfo;
136
137 ct = nf_ct_get(skb, &ctinfo);
138 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
139 goto out;
140 123
141 /* adjust seqs for loopback traffic only in outgoing direction */ 124 /* adjust seqs for loopback traffic only in outgoing direction */
142 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 125 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
@@ -144,8 +127,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
144 typeof(nf_nat_seq_adjust_hook) seq_adjust; 127 typeof(nf_nat_seq_adjust_hook) seq_adjust;
145 128
146 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); 129 seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
147 if (!seq_adjust || 130 if (!seq_adjust || !seq_adjust(skb, ct, ctinfo)) {
148 !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
149 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 131 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
150 return NF_DROP; 132 return NF_DROP;
151 } 133 }
@@ -195,13 +177,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
195 .priority = NF_IP_PRI_CONNTRACK, 177 .priority = NF_IP_PRI_CONNTRACK,
196 }, 178 },
197 { 179 {
198 .hook = ipv4_helper,
199 .owner = THIS_MODULE,
200 .pf = NFPROTO_IPV4,
201 .hooknum = NF_INET_POST_ROUTING,
202 .priority = NF_IP_PRI_CONNTRACK_HELPER,
203 },
204 {
205 .hook = ipv4_confirm, 180 .hook = ipv4_confirm,
206 .owner = THIS_MODULE, 181 .owner = THIS_MODULE,
207 .pf = NFPROTO_IPV4, 182 .pf = NFPROTO_IPV4,
@@ -209,13 +184,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
209 .priority = NF_IP_PRI_CONNTRACK_CONFIRM, 184 .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
210 }, 185 },
211 { 186 {
212 .hook = ipv4_helper,
213 .owner = THIS_MODULE,
214 .pf = NFPROTO_IPV4,
215 .hooknum = NF_INET_LOCAL_IN,
216 .priority = NF_IP_PRI_CONNTRACK_HELPER,
217 },
218 {
219 .hook = ipv4_confirm, 187 .hook = ipv4_confirm,
220 .owner = THIS_MODULE, 188 .owner = THIS_MODULE,
221 .pf = NFPROTO_IPV4, 189 .pf = NFPROTO_IPV4,
@@ -231,30 +199,35 @@ static int log_invalid_proto_max = 255;
231static ctl_table ip_ct_sysctl_table[] = { 199static ctl_table ip_ct_sysctl_table[] = {
232 { 200 {
233 .procname = "ip_conntrack_max", 201 .procname = "ip_conntrack_max",
202 .data = &nf_conntrack_max,
234 .maxlen = sizeof(int), 203 .maxlen = sizeof(int),
235 .mode = 0644, 204 .mode = 0644,
236 .proc_handler = proc_dointvec, 205 .proc_handler = proc_dointvec,
237 }, 206 },
238 { 207 {
239 .procname = "ip_conntrack_count", 208 .procname = "ip_conntrack_count",
209 .data = &init_net.ct.count,
240 .maxlen = sizeof(int), 210 .maxlen = sizeof(int),
241 .mode = 0444, 211 .mode = 0444,
242 .proc_handler = proc_dointvec, 212 .proc_handler = proc_dointvec,
243 }, 213 },
244 { 214 {
245 .procname = "ip_conntrack_buckets", 215 .procname = "ip_conntrack_buckets",
216 .data = &init_net.ct.htable_size,
246 .maxlen = sizeof(unsigned int), 217 .maxlen = sizeof(unsigned int),
247 .mode = 0444, 218 .mode = 0444,
248 .proc_handler = proc_dointvec, 219 .proc_handler = proc_dointvec,
249 }, 220 },
250 { 221 {
251 .procname = "ip_conntrack_checksum", 222 .procname = "ip_conntrack_checksum",
223 .data = &init_net.ct.sysctl_checksum,
252 .maxlen = sizeof(int), 224 .maxlen = sizeof(int),
253 .mode = 0644, 225 .mode = 0644,
254 .proc_handler = proc_dointvec, 226 .proc_handler = proc_dointvec,
255 }, 227 },
256 { 228 {
257 .procname = "ip_conntrack_log_invalid", 229 .procname = "ip_conntrack_log_invalid",
230 .data = &init_net.ct.sysctl_log_invalid,
258 .maxlen = sizeof(unsigned int), 231 .maxlen = sizeof(unsigned int),
259 .mode = 0644, 232 .mode = 0644,
260 .proc_handler = proc_dointvec_minmax, 233 .proc_handler = proc_dointvec_minmax,
@@ -330,9 +303,8 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
330static int ipv4_tuple_to_nlattr(struct sk_buff *skb, 303static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
331 const struct nf_conntrack_tuple *tuple) 304 const struct nf_conntrack_tuple *tuple)
332{ 305{
333 if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || 306 NLA_PUT_BE32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip);
334 nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) 307 NLA_PUT_BE32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip);
335 goto nla_put_failure;
336 return 0; 308 return 0;
337 309
338nla_put_failure: 310nla_put_failure:
@@ -370,25 +342,6 @@ static struct nf_sockopt_ops so_getorigdst = {
370 .owner = THIS_MODULE, 342 .owner = THIS_MODULE,
371}; 343};
372 344
373static int ipv4_init_net(struct net *net)
374{
375#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
376 struct nf_ip_net *in = &net->ct.nf_ct_proto;
377 in->ctl_table = kmemdup(ip_ct_sysctl_table,
378 sizeof(ip_ct_sysctl_table),
379 GFP_KERNEL);
380 if (!in->ctl_table)
381 return -ENOMEM;
382
383 in->ctl_table[0].data = &nf_conntrack_max;
384 in->ctl_table[1].data = &net->ct.count;
385 in->ctl_table[2].data = &net->ct.htable_size;
386 in->ctl_table[3].data = &net->ct.sysctl_checksum;
387 in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
388#endif
389 return 0;
390}
391
392struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { 345struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
393 .l3proto = PF_INET, 346 .l3proto = PF_INET,
394 .name = "ipv4", 347 .name = "ipv4",
@@ -403,9 +356,9 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
403 .nla_policy = ipv4_nla_policy, 356 .nla_policy = ipv4_nla_policy,
404#endif 357#endif
405#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 358#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
406 .ctl_table_path = "net/ipv4/netfilter", 359 .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path,
360 .ctl_table = ip_ct_sysctl_table,
407#endif 361#endif
408 .init_net = ipv4_init_net,
409 .me = THIS_MODULE, 362 .me = THIS_MODULE,
410}; 363};
411 364
@@ -416,65 +369,6 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
416MODULE_ALIAS("ip_conntrack"); 369MODULE_ALIAS("ip_conntrack");
417MODULE_LICENSE("GPL"); 370MODULE_LICENSE("GPL");
418 371
419static int ipv4_net_init(struct net *net)
420{
421 int ret = 0;
422
423 ret = nf_conntrack_l4proto_register(net,
424 &nf_conntrack_l4proto_tcp4);
425 if (ret < 0) {
426 pr_err("nf_conntrack_l4proto_tcp4 :protocol register failed\n");
427 goto out_tcp;
428 }
429 ret = nf_conntrack_l4proto_register(net,
430 &nf_conntrack_l4proto_udp4);
431 if (ret < 0) {
432 pr_err("nf_conntrack_l4proto_udp4 :protocol register failed\n");
433 goto out_udp;
434 }
435 ret = nf_conntrack_l4proto_register(net,
436 &nf_conntrack_l4proto_icmp);
437 if (ret < 0) {
438 pr_err("nf_conntrack_l4proto_icmp4 :protocol register failed\n");
439 goto out_icmp;
440 }
441 ret = nf_conntrack_l3proto_register(net,
442 &nf_conntrack_l3proto_ipv4);
443 if (ret < 0) {
444 pr_err("nf_conntrack_l3proto_ipv4 :protocol register failed\n");
445 goto out_ipv4;
446 }
447 return 0;
448out_ipv4:
449 nf_conntrack_l4proto_unregister(net,
450 &nf_conntrack_l4proto_icmp);
451out_icmp:
452 nf_conntrack_l4proto_unregister(net,
453 &nf_conntrack_l4proto_udp4);
454out_udp:
455 nf_conntrack_l4proto_unregister(net,
456 &nf_conntrack_l4proto_tcp4);
457out_tcp:
458 return ret;
459}
460
461static void ipv4_net_exit(struct net *net)
462{
463 nf_conntrack_l3proto_unregister(net,
464 &nf_conntrack_l3proto_ipv4);
465 nf_conntrack_l4proto_unregister(net,
466 &nf_conntrack_l4proto_icmp);
467 nf_conntrack_l4proto_unregister(net,
468 &nf_conntrack_l4proto_udp4);
469 nf_conntrack_l4proto_unregister(net,
470 &nf_conntrack_l4proto_tcp4);
471}
472
473static struct pernet_operations ipv4_net_ops = {
474 .init = ipv4_net_init,
475 .exit = ipv4_net_exit,
476};
477
478static int __init nf_conntrack_l3proto_ipv4_init(void) 372static int __init nf_conntrack_l3proto_ipv4_init(void)
479{ 373{
480 int ret = 0; 374 int ret = 0;
@@ -488,17 +382,35 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
488 return ret; 382 return ret;
489 } 383 }
490 384
491 ret = register_pernet_subsys(&ipv4_net_ops); 385 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
492 if (ret < 0) { 386 if (ret < 0) {
493 pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); 387 pr_err("nf_conntrack_ipv4: can't register tcp.\n");
494 goto cleanup_sockopt; 388 goto cleanup_sockopt;
495 } 389 }
496 390
391 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
392 if (ret < 0) {
393 pr_err("nf_conntrack_ipv4: can't register udp.\n");
394 goto cleanup_tcp;
395 }
396
397 ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
398 if (ret < 0) {
399 pr_err("nf_conntrack_ipv4: can't register icmp.\n");
400 goto cleanup_udp;
401 }
402
403 ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
404 if (ret < 0) {
405 pr_err("nf_conntrack_ipv4: can't register ipv4\n");
406 goto cleanup_icmp;
407 }
408
497 ret = nf_register_hooks(ipv4_conntrack_ops, 409 ret = nf_register_hooks(ipv4_conntrack_ops,
498 ARRAY_SIZE(ipv4_conntrack_ops)); 410 ARRAY_SIZE(ipv4_conntrack_ops));
499 if (ret < 0) { 411 if (ret < 0) {
500 pr_err("nf_conntrack_ipv4: can't register hooks.\n"); 412 pr_err("nf_conntrack_ipv4: can't register hooks.\n");
501 goto cleanup_pernet; 413 goto cleanup_ipv4;
502 } 414 }
503#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) 415#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
504 ret = nf_conntrack_ipv4_compat_init(); 416 ret = nf_conntrack_ipv4_compat_init();
@@ -510,8 +422,14 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
510 cleanup_hooks: 422 cleanup_hooks:
511 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); 423 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
512#endif 424#endif
513 cleanup_pernet: 425 cleanup_ipv4:
514 unregister_pernet_subsys(&ipv4_net_ops); 426 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
427 cleanup_icmp:
428 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
429 cleanup_udp:
430 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
431 cleanup_tcp:
432 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
515 cleanup_sockopt: 433 cleanup_sockopt:
516 nf_unregister_sockopt(&so_getorigdst); 434 nf_unregister_sockopt(&so_getorigdst);
517 return ret; 435 return ret;
@@ -524,7 +442,10 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
524 nf_conntrack_ipv4_compat_fini(); 442 nf_conntrack_ipv4_compat_fini();
525#endif 443#endif
526 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); 444 nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
527 unregister_pernet_subsys(&ipv4_net_ops); 445 nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
446 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp);
447 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
448 nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
528 nf_unregister_sockopt(&so_getorigdst); 449 nf_unregister_sockopt(&so_getorigdst);
529} 450}
530 451
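
The hunk above backs out the per-namespace registration of the TCP, UDP and ICMP trackers and returns to one global registration at module load. For orientation, the removed ipv4_net_ops code was an instance of the generic pernet_operations pattern; a minimal sketch of that pattern, with invented names purely for illustration, looks like this:

    #include <linux/module.h>
    #include <net/net_namespace.h>

    /* .init runs once for every network namespace, including ones created
     * after the subsystem is registered; .exit undoes it per namespace. */
    static int __net_init example_net_init(struct net *net)
    {
            /* allocate and register per-namespace state hanging off 'net' */
            return 0;
    }

    static void __net_exit example_net_exit(struct net *net)
    {
            /* release whatever example_net_init() set up for this namespace */
    }

    static struct pernet_operations example_net_ops = {
            .init = example_net_init,
            .exit = example_net_exit,
    };

    static int __init example_module_init(void)
    {
            return register_pernet_subsys(&example_net_ops);
    }

    static void __exit example_module_exit(void)
    {
            unregister_pernet_subsys(&example_net_ops);
    }

    module_init(example_module_init);
    module_exit(example_module_exit);
    MODULE_LICENSE("GPL");
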
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 9682b36df38..5585980fce2 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -21,7 +21,6 @@
21#include <net/netfilter/nf_conntrack_expect.h> 21#include <net/netfilter/nf_conntrack_expect.h>
22#include <net/netfilter/nf_conntrack_acct.h> 22#include <net/netfilter/nf_conntrack_acct.h>
23#include <linux/rculist_nulls.h> 23#include <linux/rculist_nulls.h>
24#include <linux/export.h>
25 24
26struct ct_iter_state { 25struct ct_iter_state {
27 struct seq_net_private p; 26 struct seq_net_private p;
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 5241d997ab7..ab5b27a2916 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -23,11 +23,6 @@
23 23
24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; 24static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
25 25
26static inline struct nf_icmp_net *icmp_pernet(struct net *net)
27{
28 return &net->ct.nf_ct_proto.icmp;
29}
30
31static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, 26static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
32 struct nf_conntrack_tuple *tuple) 27 struct nf_conntrack_tuple *tuple)
33{ 28{
@@ -80,31 +75,25 @@ static int icmp_print_tuple(struct seq_file *s,
80 ntohs(tuple->src.u.icmp.id)); 75 ntohs(tuple->src.u.icmp.id));
81} 76}
82 77
83static unsigned int *icmp_get_timeouts(struct net *net)
84{
85 return &icmp_pernet(net)->timeout;
86}
87
88/* Returns verdict for packet, or -1 for invalid. */ 78/* Returns verdict for packet, or -1 for invalid. */
89static int icmp_packet(struct nf_conn *ct, 79static int icmp_packet(struct nf_conn *ct,
90 const struct sk_buff *skb, 80 const struct sk_buff *skb,
91 unsigned int dataoff, 81 unsigned int dataoff,
92 enum ip_conntrack_info ctinfo, 82 enum ip_conntrack_info ctinfo,
93 u_int8_t pf, 83 u_int8_t pf,
94 unsigned int hooknum, 84 unsigned int hooknum)
95 unsigned int *timeout)
96{ 85{
97 /* Do not immediately delete the connection after the first 86 /* Do not immediately delete the connection after the first
98 successful reply to avoid excessive conntrackd traffic 87 successful reply to avoid excessive conntrackd traffic
99 and also to handle correctly ICMP echo reply duplicates. */ 88 and also to handle correctly ICMP echo reply duplicates. */
100 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); 89 nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
101 90
102 return NF_ACCEPT; 91 return NF_ACCEPT;
103} 92}
104 93
105/* Called when a new connection for this protocol found. */ 94/* Called when a new connection for this protocol found. */
106static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, 95static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
107 unsigned int dataoff, unsigned int *timeouts) 96 unsigned int dataoff)
108{ 97{
109 static const u_int8_t valid_new[] = { 98 static const u_int8_t valid_new[] = {
110 [ICMP_ECHO] = 1, 99 [ICMP_ECHO] = 1,
@@ -233,10 +222,10 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
233static int icmp_tuple_to_nlattr(struct sk_buff *skb, 222static int icmp_tuple_to_nlattr(struct sk_buff *skb,
234 const struct nf_conntrack_tuple *t) 223 const struct nf_conntrack_tuple *t)
235{ 224{
236 if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || 225 NLA_PUT_BE16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id);
237 nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || 226 NLA_PUT_U8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type);
238 nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) 227 NLA_PUT_U8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code);
239 goto nla_put_failure; 228
240 return 0; 229 return 0;
241 230
242nla_put_failure: 231nla_put_failure:
@@ -274,50 +263,12 @@ static int icmp_nlattr_tuple_size(void)
274} 263}
275#endif 264#endif
276 265
277#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
278
279#include <linux/netfilter/nfnetlink.h>
280#include <linux/netfilter/nfnetlink_cttimeout.h>
281
282static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
283 struct net *net, void *data)
284{
285 unsigned int *timeout = data;
286 struct nf_icmp_net *in = icmp_pernet(net);
287
288 if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
289 *timeout =
290 ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
291 } else {
292 /* Set default ICMP timeout. */
293 *timeout = in->timeout;
294 }
295 return 0;
296}
297
298static int
299icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
300{
301 const unsigned int *timeout = data;
302
303 if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)))
304 goto nla_put_failure;
305 return 0;
306
307nla_put_failure:
308 return -ENOSPC;
309}
310
311static const struct nla_policy
312icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
313 [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 },
314};
315#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
316
317#ifdef CONFIG_SYSCTL 266#ifdef CONFIG_SYSCTL
267static struct ctl_table_header *icmp_sysctl_header;
318static struct ctl_table icmp_sysctl_table[] = { 268static struct ctl_table icmp_sysctl_table[] = {
319 { 269 {
320 .procname = "nf_conntrack_icmp_timeout", 270 .procname = "nf_conntrack_icmp_timeout",
271 .data = &nf_ct_icmp_timeout,
321 .maxlen = sizeof(unsigned int), 272 .maxlen = sizeof(unsigned int),
322 .mode = 0644, 273 .mode = 0644,
323 .proc_handler = proc_dointvec_jiffies, 274 .proc_handler = proc_dointvec_jiffies,
@@ -328,6 +279,7 @@ static struct ctl_table icmp_sysctl_table[] = {
328static struct ctl_table icmp_compat_sysctl_table[] = { 279static struct ctl_table icmp_compat_sysctl_table[] = {
329 { 280 {
330 .procname = "ip_conntrack_icmp_timeout", 281 .procname = "ip_conntrack_icmp_timeout",
282 .data = &nf_ct_icmp_timeout,
331 .maxlen = sizeof(unsigned int), 283 .maxlen = sizeof(unsigned int),
332 .mode = 0644, 284 .mode = 0644,
333 .proc_handler = proc_dointvec_jiffies, 285 .proc_handler = proc_dointvec_jiffies,
@@ -337,62 +289,6 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
337#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ 289#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
338#endif /* CONFIG_SYSCTL */ 290#endif /* CONFIG_SYSCTL */
339 291
340static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
341 struct nf_icmp_net *in)
342{
343#ifdef CONFIG_SYSCTL
344 pn->ctl_table = kmemdup(icmp_sysctl_table,
345 sizeof(icmp_sysctl_table),
346 GFP_KERNEL);
347 if (!pn->ctl_table)
348 return -ENOMEM;
349
350 pn->ctl_table[0].data = &in->timeout;
351#endif
352 return 0;
353}
354
355static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
356 struct nf_icmp_net *in)
357{
358#ifdef CONFIG_SYSCTL
359#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
360 pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
361 sizeof(icmp_compat_sysctl_table),
362 GFP_KERNEL);
363 if (!pn->ctl_compat_table)
364 return -ENOMEM;
365
366 pn->ctl_compat_table[0].data = &in->timeout;
367#endif
368#endif
369 return 0;
370}
371
372static int icmp_init_net(struct net *net, u_int16_t proto)
373{
374 int ret;
375 struct nf_icmp_net *in = icmp_pernet(net);
376 struct nf_proto_net *pn = &in->pn;
377
378 in->timeout = nf_ct_icmp_timeout;
379
380 ret = icmp_kmemdup_compat_sysctl_table(pn, in);
381 if (ret < 0)
382 return ret;
383
384 ret = icmp_kmemdup_sysctl_table(pn, in);
385 if (ret < 0)
386 nf_ct_kfree_compat_sysctl_table(pn);
387
388 return ret;
389}
390
391static struct nf_proto_net *icmp_get_net_proto(struct net *net)
392{
393 return &net->ct.nf_ct_proto.icmp.pn;
394}
395
396struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = 292struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
397{ 293{
398 .l3proto = PF_INET, 294 .l3proto = PF_INET,
@@ -402,7 +298,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
402 .invert_tuple = icmp_invert_tuple, 298 .invert_tuple = icmp_invert_tuple,
403 .print_tuple = icmp_print_tuple, 299 .print_tuple = icmp_print_tuple,
404 .packet = icmp_packet, 300 .packet = icmp_packet,
405 .get_timeouts = icmp_get_timeouts,
406 .new = icmp_new, 301 .new = icmp_new,
407 .error = icmp_error, 302 .error = icmp_error,
408 .destroy = NULL, 303 .destroy = NULL,
@@ -413,15 +308,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
413 .nlattr_to_tuple = icmp_nlattr_to_tuple, 308 .nlattr_to_tuple = icmp_nlattr_to_tuple,
414 .nla_policy = icmp_nla_policy, 309 .nla_policy = icmp_nla_policy,
415#endif 310#endif
416#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) 311#ifdef CONFIG_SYSCTL
417 .ctnl_timeout = { 312 .ctl_table_header = &icmp_sysctl_header,
418 .nlattr_to_obj = icmp_timeout_nlattr_to_obj, 313 .ctl_table = icmp_sysctl_table,
419 .obj_to_nlattr = icmp_timeout_obj_to_nlattr, 314#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
420 .nlattr_max = CTA_TIMEOUT_ICMP_MAX, 315 .ctl_compat_table = icmp_compat_sysctl_table,
421 .obj_size = sizeof(unsigned int), 316#endif
422 .nla_policy = icmp_timeout_nla_policy, 317#endif
423 },
424#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
425 .init_net = icmp_init_net,
426 .get_net_proto = icmp_get_net_proto,
427}; 318};
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 742815518b0..9bb1b8a37a2 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -94,14 +94,14 @@ static struct nf_hook_ops ipv4_defrag_ops[] = {
94 { 94 {
95 .hook = ipv4_conntrack_defrag, 95 .hook = ipv4_conntrack_defrag,
96 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
97 .pf = NFPROTO_IPV4, 97 .pf = PF_INET,
98 .hooknum = NF_INET_PRE_ROUTING, 98 .hooknum = NF_INET_PRE_ROUTING,
99 .priority = NF_IP_PRI_CONNTRACK_DEFRAG, 99 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
100 }, 100 },
101 { 101 {
102 .hook = ipv4_conntrack_defrag, 102 .hook = ipv4_conntrack_defrag,
103 .owner = THIS_MODULE, 103 .owner = THIS_MODULE,
104 .pf = NFPROTO_IPV4, 104 .pf = PF_INET,
105 .hooknum = NF_INET_LOCAL_OUT, 105 .hooknum = NF_INET_LOCAL_OUT,
106 .priority = NF_IP_PRI_CONNTRACK_DEFRAG, 106 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
107 }, 107 },
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 9c3db10b22d..790f3160e01 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -15,12 +15,13 @@
15 15
16#include <net/netfilter/nf_nat.h> 16#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_helper.h> 17#include <net/netfilter/nf_nat_helper.h>
18#include <net/netfilter/nf_nat_rule.h>
18#include <net/netfilter/nf_conntrack_helper.h> 19#include <net/netfilter/nf_conntrack_helper.h>
19#include <net/netfilter/nf_conntrack_expect.h> 20#include <net/netfilter/nf_conntrack_expect.h>
20#include <linux/netfilter/nf_conntrack_h323.h> 21#include <linux/netfilter/nf_conntrack_h323.h>
21 22
22/****************************************************************************/ 23/****************************************************************************/
23static int set_addr(struct sk_buff *skb, unsigned int protoff, 24static int set_addr(struct sk_buff *skb,
24 unsigned char **data, int dataoff, 25 unsigned char **data, int dataoff,
25 unsigned int addroff, __be32 ip, __be16 port) 26 unsigned int addroff, __be32 ip, __be16 port)
26{ 27{
@@ -39,9 +40,11 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff,
39 40
40 if (ip_hdr(skb)->protocol == IPPROTO_TCP) { 41 if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
41 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, 42 if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
42 protoff, addroff, sizeof(buf), 43 addroff, sizeof(buf),
43 (char *) &buf, sizeof(buf))) { 44 (char *) &buf, sizeof(buf))) {
44 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n"); 45 if (net_ratelimit())
46 pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet"
47 " error\n");
45 return -1; 48 return -1;
46 } 49 }
47 50
@@ -53,9 +56,11 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff,
53 *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff; 56 *data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
54 } else { 57 } else {
55 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, 58 if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
56 protoff, addroff, sizeof(buf), 59 addroff, sizeof(buf),
57 (char *) &buf, sizeof(buf))) { 60 (char *) &buf, sizeof(buf))) {
58 net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); 61 if (net_ratelimit())
62 pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet"
63 " error\n");
59 return -1; 64 return -1;
60 } 65 }
61 /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy 66 /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
@@ -68,22 +73,22 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff,
68} 73}
69 74
70/****************************************************************************/ 75/****************************************************************************/
71static int set_h225_addr(struct sk_buff *skb, unsigned int protoff, 76static int set_h225_addr(struct sk_buff *skb,
72 unsigned char **data, int dataoff, 77 unsigned char **data, int dataoff,
73 TransportAddress *taddr, 78 TransportAddress *taddr,
74 union nf_inet_addr *addr, __be16 port) 79 union nf_inet_addr *addr, __be16 port)
75{ 80{
76 return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip, 81 return set_addr(skb, data, dataoff, taddr->ipAddress.ip,
77 addr->ip, port); 82 addr->ip, port);
78} 83}
79 84
80/****************************************************************************/ 85/****************************************************************************/
81static int set_h245_addr(struct sk_buff *skb, unsigned protoff, 86static int set_h245_addr(struct sk_buff *skb,
82 unsigned char **data, int dataoff, 87 unsigned char **data, int dataoff,
83 H245_TransportAddress *taddr, 88 H245_TransportAddress *taddr,
84 union nf_inet_addr *addr, __be16 port) 89 union nf_inet_addr *addr, __be16 port)
85{ 90{
86 return set_addr(skb, protoff, data, dataoff, 91 return set_addr(skb, data, dataoff,
87 taddr->unicastAddress.iPAddress.network, 92 taddr->unicastAddress.iPAddress.network,
88 addr->ip, port); 93 addr->ip, port);
89} 94}
@@ -91,10 +96,10 @@ static int set_h245_addr(struct sk_buff *skb, unsigned protoff,
91/****************************************************************************/ 96/****************************************************************************/
92static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct, 97static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
93 enum ip_conntrack_info ctinfo, 98 enum ip_conntrack_info ctinfo,
94 unsigned int protoff, unsigned char **data, 99 unsigned char **data,
95 TransportAddress *taddr, int count) 100 TransportAddress *taddr, int count)
96{ 101{
97 const struct nf_ct_h323_master *info = nfct_help_data(ct); 102 const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
98 int dir = CTINFO2DIR(ctinfo); 103 int dir = CTINFO2DIR(ctinfo);
99 int i; 104 int i;
100 __be16 port; 105 __be16 port;
@@ -117,8 +122,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
117 &addr.ip, port, 122 &addr.ip, port,
118 &ct->tuplehash[!dir].tuple.dst.u3.ip, 123 &ct->tuplehash[!dir].tuple.dst.u3.ip,
119 info->sig_port[!dir]); 124 info->sig_port[!dir]);
120 return set_h225_addr(skb, protoff, data, 0, 125 return set_h225_addr(skb, data, 0, &taddr[i],
121 &taddr[i],
122 &ct->tuplehash[!dir]. 126 &ct->tuplehash[!dir].
123 tuple.dst.u3, 127 tuple.dst.u3,
124 info->sig_port[!dir]); 128 info->sig_port[!dir]);
@@ -129,8 +133,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
129 &addr.ip, port, 133 &addr.ip, port,
130 &ct->tuplehash[!dir].tuple.src.u3.ip, 134 &ct->tuplehash[!dir].tuple.src.u3.ip,
131 info->sig_port[!dir]); 135 info->sig_port[!dir]);
132 return set_h225_addr(skb, protoff, data, 0, 136 return set_h225_addr(skb, data, 0, &taddr[i],
133 &taddr[i],
134 &ct->tuplehash[!dir]. 137 &ct->tuplehash[!dir].
135 tuple.src.u3, 138 tuple.src.u3,
136 info->sig_port[!dir]); 139 info->sig_port[!dir]);
@@ -144,7 +147,7 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
144/****************************************************************************/ 147/****************************************************************************/
145static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct, 148static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
146 enum ip_conntrack_info ctinfo, 149 enum ip_conntrack_info ctinfo,
147 unsigned int protoff, unsigned char **data, 150 unsigned char **data,
148 TransportAddress *taddr, int count) 151 TransportAddress *taddr, int count)
149{ 152{
150 int dir = CTINFO2DIR(ctinfo); 153 int dir = CTINFO2DIR(ctinfo);
@@ -160,7 +163,7 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
160 &addr.ip, ntohs(port), 163 &addr.ip, ntohs(port),
161 &ct->tuplehash[!dir].tuple.dst.u3.ip, 164 &ct->tuplehash[!dir].tuple.dst.u3.ip,
162 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port)); 165 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
163 return set_h225_addr(skb, protoff, data, 0, &taddr[i], 166 return set_h225_addr(skb, data, 0, &taddr[i],
164 &ct->tuplehash[!dir].tuple.dst.u3, 167 &ct->tuplehash[!dir].tuple.dst.u3,
165 ct->tuplehash[!dir].tuple. 168 ct->tuplehash[!dir].tuple.
166 dst.u.udp.port); 169 dst.u.udp.port);
@@ -173,13 +176,13 @@ static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
173/****************************************************************************/ 176/****************************************************************************/
174static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, 177static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
175 enum ip_conntrack_info ctinfo, 178 enum ip_conntrack_info ctinfo,
176 unsigned int protoff, unsigned char **data, int dataoff, 179 unsigned char **data, int dataoff,
177 H245_TransportAddress *taddr, 180 H245_TransportAddress *taddr,
178 __be16 port, __be16 rtp_port, 181 __be16 port, __be16 rtp_port,
179 struct nf_conntrack_expect *rtp_exp, 182 struct nf_conntrack_expect *rtp_exp,
180 struct nf_conntrack_expect *rtcp_exp) 183 struct nf_conntrack_expect *rtcp_exp)
181{ 184{
182 struct nf_ct_h323_master *info = nfct_help_data(ct); 185 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
183 int dir = CTINFO2DIR(ctinfo); 186 int dir = CTINFO2DIR(ctinfo);
184 int i; 187 int i;
185 u_int16_t nated_port; 188 u_int16_t nated_port;
@@ -211,7 +214,8 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
211 214
212 /* Run out of expectations */ 215 /* Run out of expectations */
213 if (i >= H323_RTP_CHANNEL_MAX) { 216 if (i >= H323_RTP_CHANNEL_MAX) {
214 net_notice_ratelimited("nf_nat_h323: out of expectations\n"); 217 if (net_ratelimit())
218 pr_notice("nf_nat_h323: out of expectations\n");
215 return 0; 219 return 0;
216 } 220 }
217 221
@@ -240,12 +244,13 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
240 } 244 }
241 245
242 if (nated_port == 0) { /* No port available */ 246 if (nated_port == 0) { /* No port available */
243 net_notice_ratelimited("nf_nat_h323: out of RTP ports\n"); 247 if (net_ratelimit())
248 pr_notice("nf_nat_h323: out of RTP ports\n");
244 return 0; 249 return 0;
245 } 250 }
246 251
247 /* Modify signal */ 252 /* Modify signal */
248 if (set_h245_addr(skb, protoff, data, dataoff, taddr, 253 if (set_h245_addr(skb, data, dataoff, taddr,
249 &ct->tuplehash[!dir].tuple.dst.u3, 254 &ct->tuplehash[!dir].tuple.dst.u3,
250 htons((port & htons(1)) ? nated_port + 1 : 255 htons((port & htons(1)) ? nated_port + 1 :
251 nated_port)) == 0) { 256 nated_port)) == 0) {
@@ -276,7 +281,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
276/****************************************************************************/ 281/****************************************************************************/
277static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, 282static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
278 enum ip_conntrack_info ctinfo, 283 enum ip_conntrack_info ctinfo,
279 unsigned int protoff, unsigned char **data, int dataoff, 284 unsigned char **data, int dataoff,
280 H245_TransportAddress *taddr, __be16 port, 285 H245_TransportAddress *taddr, __be16 port,
281 struct nf_conntrack_expect *exp) 286 struct nf_conntrack_expect *exp)
282{ 287{
@@ -303,12 +308,13 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
303 } 308 }
304 309
305 if (nated_port == 0) { /* No port available */ 310 if (nated_port == 0) { /* No port available */
306 net_notice_ratelimited("nf_nat_h323: out of TCP ports\n"); 311 if (net_ratelimit())
312 pr_notice("nf_nat_h323: out of TCP ports\n");
307 return 0; 313 return 0;
308 } 314 }
309 315
310 /* Modify signal */ 316 /* Modify signal */
311 if (set_h245_addr(skb, protoff, data, dataoff, taddr, 317 if (set_h245_addr(skb, data, dataoff, taddr,
312 &ct->tuplehash[!dir].tuple.dst.u3, 318 &ct->tuplehash[!dir].tuple.dst.u3,
313 htons(nated_port)) < 0) { 319 htons(nated_port)) < 0) {
314 nf_ct_unexpect_related(exp); 320 nf_ct_unexpect_related(exp);
@@ -327,11 +333,11 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
327/****************************************************************************/ 333/****************************************************************************/
328static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, 334static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
329 enum ip_conntrack_info ctinfo, 335 enum ip_conntrack_info ctinfo,
330 unsigned int protoff, unsigned char **data, int dataoff, 336 unsigned char **data, int dataoff,
331 TransportAddress *taddr, __be16 port, 337 TransportAddress *taddr, __be16 port,
332 struct nf_conntrack_expect *exp) 338 struct nf_conntrack_expect *exp)
333{ 339{
334 struct nf_ct_h323_master *info = nfct_help_data(ct); 340 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
335 int dir = CTINFO2DIR(ctinfo); 341 int dir = CTINFO2DIR(ctinfo);
336 u_int16_t nated_port = ntohs(port); 342 u_int16_t nated_port = ntohs(port);
337 343
@@ -359,12 +365,13 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
359 } 365 }
360 366
361 if (nated_port == 0) { /* No port available */ 367 if (nated_port == 0) { /* No port available */
362 net_notice_ratelimited("nf_nat_q931: out of TCP ports\n"); 368 if (net_ratelimit())
369 pr_notice("nf_nat_q931: out of TCP ports\n");
363 return 0; 370 return 0;
364 } 371 }
365 372
366 /* Modify signal */ 373 /* Modify signal */
367 if (set_h225_addr(skb, protoff, data, dataoff, taddr, 374 if (set_h225_addr(skb, data, dataoff, taddr,
368 &ct->tuplehash[!dir].tuple.dst.u3, 375 &ct->tuplehash[!dir].tuple.dst.u3,
369 htons(nated_port)) == 0) { 376 htons(nated_port)) == 0) {
370 /* Save ports */ 377 /* Save ports */
@@ -402,27 +409,25 @@ static void ip_nat_q931_expect(struct nf_conn *new,
402 BUG_ON(new->status & IPS_NAT_DONE_MASK); 409 BUG_ON(new->status & IPS_NAT_DONE_MASK);
403 410
404 /* Change src to where master sends to */ 411 /* Change src to where master sends to */
405 range.flags = NF_NAT_RANGE_MAP_IPS; 412 range.flags = IP_NAT_RANGE_MAP_IPS;
406 range.min_addr = range.max_addr = 413 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
407 new->tuplehash[!this->dir].tuple.src.u3; 414 nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);
408 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
409 415
410 /* For DST manip, map port here to where it's expected. */ 416 /* For DST manip, map port here to where it's expected. */
411 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); 417 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
412 range.min_proto = range.max_proto = this->saved_proto; 418 range.min = range.max = this->saved_proto;
413 range.min_addr = range.max_addr = 419 range.min_ip = range.max_ip =
414 new->master->tuplehash[!this->dir].tuple.src.u3; 420 new->master->tuplehash[!this->dir].tuple.src.u3.ip;
415 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); 421 nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
416} 422}
417 423
418/****************************************************************************/ 424/****************************************************************************/
419static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, 425static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
420 enum ip_conntrack_info ctinfo, 426 enum ip_conntrack_info ctinfo,
421 unsigned int protoff, unsigned char **data, 427 unsigned char **data, TransportAddress *taddr, int idx,
422 TransportAddress *taddr, int idx,
423 __be16 port, struct nf_conntrack_expect *exp) 428 __be16 port, struct nf_conntrack_expect *exp)
424{ 429{
425 struct nf_ct_h323_master *info = nfct_help_data(ct); 430 struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info;
426 int dir = CTINFO2DIR(ctinfo); 431 int dir = CTINFO2DIR(ctinfo);
427 u_int16_t nated_port = ntohs(port); 432 u_int16_t nated_port = ntohs(port);
428 union nf_inet_addr addr; 433 union nf_inet_addr addr;
@@ -451,12 +456,13 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
451 } 456 }
452 457
453 if (nated_port == 0) { /* No port available */ 458 if (nated_port == 0) { /* No port available */
454 net_notice_ratelimited("nf_nat_ras: out of TCP ports\n"); 459 if (net_ratelimit())
460 pr_notice("nf_nat_ras: out of TCP ports\n");
455 return 0; 461 return 0;
456 } 462 }
457 463
458 /* Modify signal */ 464 /* Modify signal */
459 if (set_h225_addr(skb, protoff, data, 0, &taddr[idx], 465 if (set_h225_addr(skb, data, 0, &taddr[idx],
460 &ct->tuplehash[!dir].tuple.dst.u3, 466 &ct->tuplehash[!dir].tuple.dst.u3,
461 htons(nated_port)) == 0) { 467 htons(nated_port)) == 0) {
462 /* Save ports */ 468 /* Save ports */
@@ -467,7 +473,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
467 if (idx > 0 && 473 if (idx > 0 &&
468 get_h225_addr(ct, *data, &taddr[0], &addr, &port) && 474 get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
469 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) { 475 (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
470 set_h225_addr(skb, protoff, data, 0, &taddr[0], 476 set_h225_addr(skb, data, 0, &taddr[0],
471 &ct->tuplehash[!dir].tuple.dst.u3, 477 &ct->tuplehash[!dir].tuple.dst.u3,
472 info->sig_port[!dir]); 478 info->sig_port[!dir]);
473 } 479 }
@@ -496,22 +502,20 @@ static void ip_nat_callforwarding_expect(struct nf_conn *new,
496 BUG_ON(new->status & IPS_NAT_DONE_MASK); 502 BUG_ON(new->status & IPS_NAT_DONE_MASK);
497 503
498 /* Change src to where master sends to */ 504 /* Change src to where master sends to */
499 range.flags = NF_NAT_RANGE_MAP_IPS; 505 range.flags = IP_NAT_RANGE_MAP_IPS;
500 range.min_addr = range.max_addr = 506 range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
501 new->tuplehash[!this->dir].tuple.src.u3; 507 nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);
502 nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);
503 508
504 /* For DST manip, map port here to where it's expected. */ 509 /* For DST manip, map port here to where it's expected. */
505 range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED); 510 range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
506 range.min_proto = range.max_proto = this->saved_proto; 511 range.min = range.max = this->saved_proto;
507 range.min_addr = range.max_addr = this->saved_addr; 512 range.min_ip = range.max_ip = this->saved_ip;
508 nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST); 513 nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
509} 514}
510 515
511/****************************************************************************/ 516/****************************************************************************/
512static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, 517static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
513 enum ip_conntrack_info ctinfo, 518 enum ip_conntrack_info ctinfo,
514 unsigned int protoff,
515 unsigned char **data, int dataoff, 519 unsigned char **data, int dataoff,
516 TransportAddress *taddr, __be16 port, 520 TransportAddress *taddr, __be16 port,
517 struct nf_conntrack_expect *exp) 521 struct nf_conntrack_expect *exp)
@@ -520,7 +524,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
520 u_int16_t nated_port; 524 u_int16_t nated_port;
521 525
522 /* Set expectations for NAT */ 526 /* Set expectations for NAT */
523 exp->saved_addr = exp->tuple.dst.u3; 527 exp->saved_ip = exp->tuple.dst.u3.ip;
524 exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip; 528 exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
525 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; 529 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
526 exp->expectfn = ip_nat_callforwarding_expect; 530 exp->expectfn = ip_nat_callforwarding_expect;
@@ -541,12 +545,13 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
541 } 545 }
542 546
543 if (nated_port == 0) { /* No port available */ 547 if (nated_port == 0) { /* No port available */
544 net_notice_ratelimited("nf_nat_q931: out of TCP ports\n"); 548 if (net_ratelimit())
549 pr_notice("nf_nat_q931: out of TCP ports\n");
545 return 0; 550 return 0;
546 } 551 }
547 552
548 /* Modify signal */ 553 /* Modify signal */
549 if (!set_h225_addr(skb, protoff, data, dataoff, taddr, 554 if (!set_h225_addr(skb, data, dataoff, taddr,
550 &ct->tuplehash[!dir].tuple.dst.u3, 555 &ct->tuplehash[!dir].tuple.dst.u3,
551 htons(nated_port)) == 0) { 556 htons(nated_port)) == 0) {
552 nf_ct_unexpect_related(exp); 557 nf_ct_unexpect_related(exp);
@@ -563,16 +568,6 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
563 return 0; 568 return 0;
564} 569}
565 570
566static struct nf_ct_helper_expectfn q931_nat = {
567 .name = "Q.931",
568 .expectfn = ip_nat_q931_expect,
569};
570
571static struct nf_ct_helper_expectfn callforwarding_nat = {
572 .name = "callforwarding",
573 .expectfn = ip_nat_callforwarding_expect,
574};
575
576/****************************************************************************/ 571/****************************************************************************/
577static int __init init(void) 572static int __init init(void)
578{ 573{
@@ -586,34 +581,30 @@ static int __init init(void)
586 BUG_ON(nat_callforwarding_hook != NULL); 581 BUG_ON(nat_callforwarding_hook != NULL);
587 BUG_ON(nat_q931_hook != NULL); 582 BUG_ON(nat_q931_hook != NULL);
588 583
589 RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr); 584 rcu_assign_pointer(set_h245_addr_hook, set_h245_addr);
590 RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr); 585 rcu_assign_pointer(set_h225_addr_hook, set_h225_addr);
591 RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr); 586 rcu_assign_pointer(set_sig_addr_hook, set_sig_addr);
592 RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr); 587 rcu_assign_pointer(set_ras_addr_hook, set_ras_addr);
593 RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp); 588 rcu_assign_pointer(nat_rtp_rtcp_hook, nat_rtp_rtcp);
594 RCU_INIT_POINTER(nat_t120_hook, nat_t120); 589 rcu_assign_pointer(nat_t120_hook, nat_t120);
595 RCU_INIT_POINTER(nat_h245_hook, nat_h245); 590 rcu_assign_pointer(nat_h245_hook, nat_h245);
596 RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding); 591 rcu_assign_pointer(nat_callforwarding_hook, nat_callforwarding);
597 RCU_INIT_POINTER(nat_q931_hook, nat_q931); 592 rcu_assign_pointer(nat_q931_hook, nat_q931);
598 nf_ct_helper_expectfn_register(&q931_nat);
599 nf_ct_helper_expectfn_register(&callforwarding_nat);
600 return 0; 593 return 0;
601} 594}
602 595
603/****************************************************************************/ 596/****************************************************************************/
604static void __exit fini(void) 597static void __exit fini(void)
605{ 598{
606 RCU_INIT_POINTER(set_h245_addr_hook, NULL); 599 rcu_assign_pointer(set_h245_addr_hook, NULL);
607 RCU_INIT_POINTER(set_h225_addr_hook, NULL); 600 rcu_assign_pointer(set_h225_addr_hook, NULL);
608 RCU_INIT_POINTER(set_sig_addr_hook, NULL); 601 rcu_assign_pointer(set_sig_addr_hook, NULL);
609 RCU_INIT_POINTER(set_ras_addr_hook, NULL); 602 rcu_assign_pointer(set_ras_addr_hook, NULL);
610 RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL); 603 rcu_assign_pointer(nat_rtp_rtcp_hook, NULL);
611 RCU_INIT_POINTER(nat_t120_hook, NULL); 604 rcu_assign_pointer(nat_t120_hook, NULL);
612 RCU_INIT_POINTER(nat_h245_hook, NULL); 605 rcu_assign_pointer(nat_h245_hook, NULL);
613 RCU_INIT_POINTER(nat_callforwarding_hook, NULL); 606 rcu_assign_pointer(nat_callforwarding_hook, NULL);
614 RCU_INIT_POINTER(nat_q931_hook, NULL); 607 rcu_assign_pointer(nat_q931_hook, NULL);
615 nf_ct_helper_expectfn_unregister(&q931_nat);
616 nf_ct_helper_expectfn_unregister(&callforwarding_nat);
617 synchronize_rcu(); 608 synchronize_rcu();
618} 609}
619 610
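
Several expectation callbacks in this file (ip_nat_q931_expect, ip_nat_callforwarding_expect) and in nf_nat_pptp.c further down are converted back from the newer nf_nat_range layout (min_addr/max_addr, NF_NAT_RANGE_* flags, NF_NAT_MANIP_*) to the older one (min_ip/max_ip, IP_NAT_RANGE_* flags, IP_NAT_MANIP_*). The recurring shape of such a callback, written with the older field names exactly as they appear in the restored column (a sketch, not the file's literal code), is:

    static void example_nat_expected(struct nf_conn *new,
                                     struct nf_conntrack_expect *this)
    {
            struct nf_nat_range range;

            /* SRC manip: map the source to where the master conntrack
             * already sends; only the address is constrained. */
            range.flags = IP_NAT_RANGE_MAP_IPS;
            range.min_ip = range.max_ip =
                    new->tuplehash[!this->dir].tuple.src.u3.ip;
            nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);

            /* DST manip: additionally pin the port saved on the expectation */
            range.flags = IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED;
            range.min = range.max = this->saved_proto;
            range.min_ip = range.max_ip =
                    new->master->tuplehash[!this->dir].tuple.src.u3.ip;
            nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
    }
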
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
deleted file mode 100644
index d8b2e14efdd..00000000000
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ /dev/null
@@ -1,281 +0,0 @@
1/*
2 * (C) 1999-2001 Paul `Rusty' Russell
3 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
4 * (C) 2011 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/types.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <linux/ip.h>
15#include <linux/icmp.h>
16#include <linux/netfilter.h>
17#include <linux/netfilter_ipv4.h>
18#include <net/secure_seq.h>
19#include <net/checksum.h>
20#include <net/route.h>
21#include <net/ip.h>
22
23#include <net/netfilter/nf_conntrack_core.h>
24#include <net/netfilter/nf_conntrack.h>
25#include <net/netfilter/nf_nat_core.h>
26#include <net/netfilter/nf_nat_l3proto.h>
27#include <net/netfilter/nf_nat_l4proto.h>
28
29static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
30
31#ifdef CONFIG_XFRM
32static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
33 const struct nf_conn *ct,
34 enum ip_conntrack_dir dir,
35 unsigned long statusbit,
36 struct flowi *fl)
37{
38 const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
39 struct flowi4 *fl4 = &fl->u.ip4;
40
41 if (ct->status & statusbit) {
42 fl4->daddr = t->dst.u3.ip;
43 if (t->dst.protonum == IPPROTO_TCP ||
44 t->dst.protonum == IPPROTO_UDP ||
45 t->dst.protonum == IPPROTO_UDPLITE ||
46 t->dst.protonum == IPPROTO_DCCP ||
47 t->dst.protonum == IPPROTO_SCTP)
48 fl4->fl4_dport = t->dst.u.all;
49 }
50
51 statusbit ^= IPS_NAT_MASK;
52
53 if (ct->status & statusbit) {
54 fl4->saddr = t->src.u3.ip;
55 if (t->dst.protonum == IPPROTO_TCP ||
56 t->dst.protonum == IPPROTO_UDP ||
57 t->dst.protonum == IPPROTO_UDPLITE ||
58 t->dst.protonum == IPPROTO_DCCP ||
59 t->dst.protonum == IPPROTO_SCTP)
60 fl4->fl4_sport = t->src.u.all;
61 }
62}
63#endif /* CONFIG_XFRM */
64
65static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
66 const struct nf_nat_range *range)
67{
68 return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
69 ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
70}
71
72static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
73 __be16 dport)
74{
75 return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
76}
77
78static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
79 unsigned int iphdroff,
80 const struct nf_nat_l4proto *l4proto,
81 const struct nf_conntrack_tuple *target,
82 enum nf_nat_manip_type maniptype)
83{
84 struct iphdr *iph;
85 unsigned int hdroff;
86
87 if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
88 return false;
89
90 iph = (void *)skb->data + iphdroff;
91 hdroff = iphdroff + iph->ihl * 4;
92
93 if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
94 target, maniptype))
95 return false;
96 iph = (void *)skb->data + iphdroff;
97
98 if (maniptype == NF_NAT_MANIP_SRC) {
99 csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
100 iph->saddr = target->src.u3.ip;
101 } else {
102 csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
103 iph->daddr = target->dst.u3.ip;
104 }
105 return true;
106}
107
108static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
109 unsigned int iphdroff, __sum16 *check,
110 const struct nf_conntrack_tuple *t,
111 enum nf_nat_manip_type maniptype)
112{
113 struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
114 __be32 oldip, newip;
115
116 if (maniptype == NF_NAT_MANIP_SRC) {
117 oldip = iph->saddr;
118 newip = t->src.u3.ip;
119 } else {
120 oldip = iph->daddr;
121 newip = t->dst.u3.ip;
122 }
123 inet_proto_csum_replace4(check, skb, oldip, newip, 1);
124}
125
126static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
127 u8 proto, void *data, __sum16 *check,
128 int datalen, int oldlen)
129{
130 const struct iphdr *iph = ip_hdr(skb);
131 struct rtable *rt = skb_rtable(skb);
132
133 if (skb->ip_summed != CHECKSUM_PARTIAL) {
134 if (!(rt->rt_flags & RTCF_LOCAL) &&
135 (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
136 skb->ip_summed = CHECKSUM_PARTIAL;
137 skb->csum_start = skb_headroom(skb) +
138 skb_network_offset(skb) +
139 ip_hdrlen(skb);
140 skb->csum_offset = (void *)check - data;
141 *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
142 datalen, proto, 0);
143 } else {
144 *check = 0;
145 *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
146 datalen, proto,
147 csum_partial(data, datalen,
148 0));
149 if (proto == IPPROTO_UDP && !*check)
150 *check = CSUM_MANGLED_0;
151 }
152 } else
153 inet_proto_csum_replace2(check, skb,
154 htons(oldlen), htons(datalen), 1);
155}
156
157static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
158 struct nf_nat_range *range)
159{
160 if (tb[CTA_NAT_V4_MINIP]) {
161 range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
162 range->flags |= NF_NAT_RANGE_MAP_IPS;
163 }
164
165 if (tb[CTA_NAT_V4_MAXIP])
166 range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
167 else
168 range->max_addr.ip = range->min_addr.ip;
169
170 return 0;
171}
172
173static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
174 .l3proto = NFPROTO_IPV4,
175 .in_range = nf_nat_ipv4_in_range,
176 .secure_port = nf_nat_ipv4_secure_port,
177 .manip_pkt = nf_nat_ipv4_manip_pkt,
178 .csum_update = nf_nat_ipv4_csum_update,
179 .csum_recalc = nf_nat_ipv4_csum_recalc,
180 .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
181#ifdef CONFIG_XFRM
182 .decode_session = nf_nat_ipv4_decode_session,
183#endif
184};
185
186int nf_nat_icmp_reply_translation(struct sk_buff *skb,
187 struct nf_conn *ct,
188 enum ip_conntrack_info ctinfo,
189 unsigned int hooknum)
190{
191 struct {
192 struct icmphdr icmp;
193 struct iphdr ip;
194 } *inside;
195 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
196 enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
197 unsigned int hdrlen = ip_hdrlen(skb);
198 const struct nf_nat_l4proto *l4proto;
199 struct nf_conntrack_tuple target;
200 unsigned long statusbit;
201
202 NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
203
204 if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
205 return 0;
206 if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
207 return 0;
208
209 inside = (void *)skb->data + hdrlen;
210 if (inside->icmp.type == ICMP_REDIRECT) {
211 if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
212 return 0;
213 if (ct->status & IPS_NAT_MASK)
214 return 0;
215 }
216
217 if (manip == NF_NAT_MANIP_SRC)
218 statusbit = IPS_SRC_NAT;
219 else
220 statusbit = IPS_DST_NAT;
221
222 /* Invert if this is reply direction */
223 if (dir == IP_CT_DIR_REPLY)
224 statusbit ^= IPS_NAT_MASK;
225
226 if (!(ct->status & statusbit))
227 return 1;
228
229 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
230 if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
231 l4proto, &ct->tuplehash[!dir].tuple, !manip))
232 return 0;
233
234 if (skb->ip_summed != CHECKSUM_PARTIAL) {
235 /* Reloading "inside" here since manip_pkt may reallocate */
236 inside = (void *)skb->data + hdrlen;
237 inside->icmp.checksum = 0;
238 inside->icmp.checksum =
239 csum_fold(skb_checksum(skb, hdrlen,
240 skb->len - hdrlen, 0));
241 }
242
243 /* Change outer to look like the reply to an incoming packet */
244 nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
245 l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
246 if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
247 return 0;
248
249 return 1;
250}
251EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
252
253static int __init nf_nat_l3proto_ipv4_init(void)
254{
255 int err;
256
257 err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
258 if (err < 0)
259 goto err1;
260 err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
261 if (err < 0)
262 goto err2;
263 return err;
264
265err2:
266 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
267err1:
268 return err;
269}
270
271static void __exit nf_nat_l3proto_ipv4_exit(void)
272{
273 nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
274 nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
275}
276
277MODULE_LICENSE("GPL");
278MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
279
280module_init(nf_nat_l3proto_ipv4_init);
281module_exit(nf_nat_l3proto_ipv4_exit);
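
Both the deleted nf_nat_l3proto_ipv4_init() above and nf_conntrack_l3proto_ipv4_init() earlier in the series rely on the kernel's usual goto-unwind error handling: each successfully registered resource gets a label, and a later failure jumps to the label that releases everything acquired so far, in reverse order. A small, self-contained user-space illustration of the same idiom (nothing here is kernel API; it only mirrors the control flow of the error paths above):

    #include <stdio.h>
    #include <stdlib.h>

    /* stand-ins for resources that must be released in reverse order */
    static int acquire(const char *name) { printf("acquire %s\n", name); return 0; }
    static void release(const char *name) { printf("release %s\n", name); }

    static int setup(void)
    {
            int err;

            err = acquire("tcp");
            if (err)
                    goto out;
            err = acquire("udp");
            if (err)
                    goto out_tcp;
            err = acquire("icmp");
            if (err)
                    goto out_udp;
            return 0;               /* everything registered */

    out_udp:
            release("udp");
    out_tcp:
            release("tcp");
    out:
            return err;
    }

    int main(void)
    {
            return setup() ? EXIT_FAILURE : EXIT_SUCCESS;
    }
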
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index a06d7d74817..4c060038d29 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -22,6 +22,7 @@
22 22
23#include <net/netfilter/nf_nat.h> 23#include <net/netfilter/nf_nat.h>
24#include <net/netfilter/nf_nat_helper.h> 24#include <net/netfilter/nf_nat_helper.h>
25#include <net/netfilter/nf_nat_rule.h>
25#include <net/netfilter/nf_conntrack_helper.h> 26#include <net/netfilter/nf_conntrack_helper.h>
26#include <net/netfilter/nf_conntrack_expect.h> 27#include <net/netfilter/nf_conntrack_expect.h>
27#include <net/netfilter/nf_conntrack_zones.h> 28#include <net/netfilter/nf_conntrack_zones.h>
@@ -48,7 +49,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
48 const struct nf_nat_pptp *nat_pptp_info; 49 const struct nf_nat_pptp *nat_pptp_info;
49 struct nf_nat_range range; 50 struct nf_nat_range range;
50 51
51 ct_pptp_info = nfct_help_data(master); 52 ct_pptp_info = &nfct_help(master)->help.ct_pptp_info;
52 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; 53 nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;
53 54
54 /* And here goes the grand finale of corrosion... */ 55 /* And here goes the grand finale of corrosion... */
@@ -87,24 +88,24 @@ static void pptp_nat_expected(struct nf_conn *ct,
87 BUG_ON(ct->status & IPS_NAT_DONE_MASK); 88 BUG_ON(ct->status & IPS_NAT_DONE_MASK);
88 89
89 /* Change src to where master sends to */ 90 /* Change src to where master sends to */
90 range.flags = NF_NAT_RANGE_MAP_IPS; 91 range.flags = IP_NAT_RANGE_MAP_IPS;
91 range.min_addr = range.max_addr 92 range.min_ip = range.max_ip
92 = ct->master->tuplehash[!exp->dir].tuple.dst.u3; 93 = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
93 if (exp->dir == IP_CT_DIR_ORIGINAL) { 94 if (exp->dir == IP_CT_DIR_ORIGINAL) {
94 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 95 range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
95 range.min_proto = range.max_proto = exp->saved_proto; 96 range.min = range.max = exp->saved_proto;
96 } 97 }
97 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); 98 nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
98 99
99 /* For DST manip, map port here to where it's expected. */ 100 /* For DST manip, map port here to where it's expected. */
100 range.flags = NF_NAT_RANGE_MAP_IPS; 101 range.flags = IP_NAT_RANGE_MAP_IPS;
101 range.min_addr = range.max_addr 102 range.min_ip = range.max_ip
102 = ct->master->tuplehash[!exp->dir].tuple.src.u3; 103 = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
103 if (exp->dir == IP_CT_DIR_REPLY) { 104 if (exp->dir == IP_CT_DIR_REPLY) {
104 range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 105 range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
105 range.min_proto = range.max_proto = exp->saved_proto; 106 range.min = range.max = exp->saved_proto;
106 } 107 }
107 nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); 108 nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
108} 109}
109 110
110/* outbound packets == from PNS to PAC */ 111/* outbound packets == from PNS to PAC */
@@ -112,7 +113,6 @@ static int
112pptp_outbound_pkt(struct sk_buff *skb, 113pptp_outbound_pkt(struct sk_buff *skb,
113 struct nf_conn *ct, 114 struct nf_conn *ct,
114 enum ip_conntrack_info ctinfo, 115 enum ip_conntrack_info ctinfo,
115 unsigned int protoff,
116 struct PptpControlHeader *ctlh, 116 struct PptpControlHeader *ctlh,
117 union pptp_ctrl_union *pptpReq) 117 union pptp_ctrl_union *pptpReq)
118 118
@@ -123,7 +123,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
123 __be16 new_callid; 123 __be16 new_callid;
124 unsigned int cid_off; 124 unsigned int cid_off;
125 125
126 ct_pptp_info = nfct_help_data(ct); 126 ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info;
127 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; 127 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
128 128
129 new_callid = ct_pptp_info->pns_call_id; 129 new_callid = ct_pptp_info->pns_call_id;
@@ -175,7 +175,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
175 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); 175 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));
176 176
177 /* mangle packet */ 177 /* mangle packet */
178 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, 178 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
179 cid_off + sizeof(struct pptp_pkt_hdr) + 179 cid_off + sizeof(struct pptp_pkt_hdr) +
180 sizeof(struct PptpControlHeader), 180 sizeof(struct PptpControlHeader),
181 sizeof(new_callid), (char *)&new_callid, 181 sizeof(new_callid), (char *)&new_callid,
@@ -192,7 +192,7 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
192 struct nf_ct_pptp_master *ct_pptp_info; 192 struct nf_ct_pptp_master *ct_pptp_info;
193 struct nf_nat_pptp *nat_pptp_info; 193 struct nf_nat_pptp *nat_pptp_info;
194 194
195 ct_pptp_info = nfct_help_data(ct); 195 ct_pptp_info = &nfct_help(ct)->help.ct_pptp_info;
196 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; 196 nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
197 197
198 /* save original PAC call ID in nat_info */ 198 /* save original PAC call ID in nat_info */
@@ -216,7 +216,6 @@ static int
216pptp_inbound_pkt(struct sk_buff *skb, 216pptp_inbound_pkt(struct sk_buff *skb,
217 struct nf_conn *ct, 217 struct nf_conn *ct,
218 enum ip_conntrack_info ctinfo, 218 enum ip_conntrack_info ctinfo,
219 unsigned int protoff,
220 struct PptpControlHeader *ctlh, 219 struct PptpControlHeader *ctlh,
221 union pptp_ctrl_union *pptpReq) 220 union pptp_ctrl_union *pptpReq)
222{ 221{
@@ -269,7 +268,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
269 pr_debug("altering peer call id from 0x%04x to 0x%04x\n", 268 pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
270 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); 269 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));
271 270
272 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, 271 if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
273 pcid_off + sizeof(struct pptp_pkt_hdr) + 272 pcid_off + sizeof(struct pptp_pkt_hdr) +
274 sizeof(struct PptpControlHeader), 273 sizeof(struct PptpControlHeader),
275 sizeof(new_pcid), (char *)&new_pcid, 274 sizeof(new_pcid), (char *)&new_pcid,
@@ -283,25 +282,25 @@ static int __init nf_nat_helper_pptp_init(void)
283 nf_nat_need_gre(); 282 nf_nat_need_gre();
284 283
285 BUG_ON(nf_nat_pptp_hook_outbound != NULL); 284 BUG_ON(nf_nat_pptp_hook_outbound != NULL);
286 RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); 285 rcu_assign_pointer(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
287 286
288 BUG_ON(nf_nat_pptp_hook_inbound != NULL); 287 BUG_ON(nf_nat_pptp_hook_inbound != NULL);
289 RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); 288 rcu_assign_pointer(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
290 289
291 BUG_ON(nf_nat_pptp_hook_exp_gre != NULL); 290 BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
292 RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); 291 rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
293 292
294 BUG_ON(nf_nat_pptp_hook_expectfn != NULL); 293 BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
295 RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected); 294 rcu_assign_pointer(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
296 return 0; 295 return 0;
297} 296}
298 297
299static void __exit nf_nat_helper_pptp_fini(void) 298static void __exit nf_nat_helper_pptp_fini(void)
300{ 299{
301 RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL); 300 rcu_assign_pointer(nf_nat_pptp_hook_expectfn, NULL);
302 RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL); 301 rcu_assign_pointer(nf_nat_pptp_hook_exp_gre, NULL);
303 RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL); 302 rcu_assign_pointer(nf_nat_pptp_hook_inbound, NULL);
304 RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL); 303 rcu_assign_pointer(nf_nat_pptp_hook_outbound, NULL);
305 synchronize_rcu(); 304 synchronize_rcu();
306} 305}
307 306
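
As in nf_nat_h323.c above, the hook-pointer assignments here go back from RCU_INIT_POINTER() to rcu_assign_pointer(): the latter always issues the publish barrier, while RCU_INIT_POINTER() is the later, cheaper form meant for cases where no ordering is required (initialisation, or storing NULL). The reader side of such an optional hook follows the usual RCU pattern; a minimal sketch with an invented hook name:

    #include <linux/rcupdate.h>
    #include <linux/skbuff.h>

    typedef void (*example_hook_t)(struct sk_buff *skb);
    static example_hook_t __rcu example_hook __read_mostly;

    static void example_caller(struct sk_buff *skb)
    {
            example_hook_t fn;

            rcu_read_lock();
            fn = rcu_dereference(example_hook);
            if (fn)
                    fn(skb);
            rcu_read_unlock();
    }

    /* provider module:
     *   init: rcu_assign_pointer(example_hook, my_impl);
     *   exit: rcu_assign_pointer(example_hook, NULL); synchronize_rcu();
     */
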
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index ea44f02563b..bc8d83a31c7 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -28,7 +28,8 @@
28#include <linux/ip.h> 28#include <linux/ip.h>
29 29
30#include <net/netfilter/nf_nat.h> 30#include <net/netfilter/nf_nat.h>
31#include <net/netfilter/nf_nat_l4proto.h> 31#include <net/netfilter/nf_nat_rule.h>
32#include <net/netfilter/nf_nat_protocol.h>
32#include <linux/netfilter/nf_conntrack_proto_gre.h> 33#include <linux/netfilter/nf_conntrack_proto_gre.h>
33 34
34MODULE_LICENSE("GPL"); 35MODULE_LICENSE("GPL");
@@ -37,8 +38,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
37 38
38/* generate unique tuple ... */ 39/* generate unique tuple ... */
39static void 40static void
40gre_unique_tuple(const struct nf_nat_l3proto *l3proto, 41gre_unique_tuple(struct nf_conntrack_tuple *tuple,
41 struct nf_conntrack_tuple *tuple,
42 const struct nf_nat_range *range, 42 const struct nf_nat_range *range,
43 enum nf_nat_manip_type maniptype, 43 enum nf_nat_manip_type maniptype,
44 const struct nf_conn *ct) 44 const struct nf_conn *ct)
@@ -52,18 +52,18 @@ gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
52 if (!ct->master) 52 if (!ct->master)
53 return; 53 return;
54 54
55 if (maniptype == NF_NAT_MANIP_SRC) 55 if (maniptype == IP_NAT_MANIP_SRC)
56 keyptr = &tuple->src.u.gre.key; 56 keyptr = &tuple->src.u.gre.key;
57 else 57 else
58 keyptr = &tuple->dst.u.gre.key; 58 keyptr = &tuple->dst.u.gre.key;
59 59
60 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { 60 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
61 pr_debug("%p: NATing GRE PPTP\n", ct); 61 pr_debug("%p: NATing GRE PPTP\n", ct);
62 min = 1; 62 min = 1;
63 range_size = 0xffff; 63 range_size = 0xffff;
64 } else { 64 } else {
65 min = ntohs(range->min_proto.gre.key); 65 min = ntohs(range->min.gre.key);
66 range_size = ntohs(range->max_proto.gre.key) - min + 1; 66 range_size = ntohs(range->max.gre.key) - min + 1;
67 } 67 }
68 68
69 pr_debug("min = %u, range_size = %u\n", min, range_size); 69 pr_debug("min = %u, range_size = %u\n", min, range_size);
@@ -80,14 +80,14 @@ gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
80 80
81/* manipulate a GRE packet according to maniptype */ 81/* manipulate a GRE packet according to maniptype */
82static bool 82static bool
83gre_manip_pkt(struct sk_buff *skb, 83gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff,
84 const struct nf_nat_l3proto *l3proto,
85 unsigned int iphdroff, unsigned int hdroff,
86 const struct nf_conntrack_tuple *tuple, 84 const struct nf_conntrack_tuple *tuple,
87 enum nf_nat_manip_type maniptype) 85 enum nf_nat_manip_type maniptype)
88{ 86{
89 const struct gre_hdr *greh; 87 const struct gre_hdr *greh;
90 struct gre_hdr_pptp *pgreh; 88 struct gre_hdr_pptp *pgreh;
89 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
90 unsigned int hdroff = iphdroff + iph->ihl * 4;
91 91
92 /* pgreh includes two optional 32bit fields which are not required 92 /* pgreh includes two optional 32bit fields which are not required
93 * to be there. That's where the magic '8' comes from */ 93 * to be there. That's where the magic '8' comes from */
@@ -99,7 +99,7 @@ gre_manip_pkt(struct sk_buff *skb,
99 99
100 /* we only have destination manip of a packet, since 'source key' 100 /* we only have destination manip of a packet, since 'source key'
101 * is not present in the packet itself */ 101 * is not present in the packet itself */
102 if (maniptype != NF_NAT_MANIP_DST) 102 if (maniptype != IP_NAT_MANIP_DST)
103 return true; 103 return true;
104 switch (greh->version) { 104 switch (greh->version) {
105 case GRE_VERSION_1701: 105 case GRE_VERSION_1701:
@@ -117,24 +117,26 @@ gre_manip_pkt(struct sk_buff *skb,
117 return true; 117 return true;
118} 118}
119 119
120static const struct nf_nat_l4proto gre = { 120static const struct nf_nat_protocol gre = {
121 .l4proto = IPPROTO_GRE, 121 .protonum = IPPROTO_GRE,
122 .me = THIS_MODULE,
122 .manip_pkt = gre_manip_pkt, 123 .manip_pkt = gre_manip_pkt,
123 .in_range = nf_nat_l4proto_in_range, 124 .in_range = nf_nat_proto_in_range,
124 .unique_tuple = gre_unique_tuple, 125 .unique_tuple = gre_unique_tuple,
125#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 126#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
126 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, 127 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
128 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
127#endif 129#endif
128}; 130};
129 131
130static int __init nf_nat_proto_gre_init(void) 132static int __init nf_nat_proto_gre_init(void)
131{ 133{
132 return nf_nat_l4proto_register(NFPROTO_IPV4, &gre); 134 return nf_nat_protocol_register(&gre);
133} 135}
134 136
135static void __exit nf_nat_proto_gre_fini(void) 137static void __exit nf_nat_proto_gre_fini(void)
136{ 138{
137 nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre); 139 nf_nat_protocol_unregister(&gre);
138} 140}
139 141
140module_init(nf_nat_proto_gre_init); 142module_init(nf_nat_proto_gre_init);
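In both versions, gre_unique_tuple() walks candidate GRE keys inside [min, min + range_size), falling back to 1..0xffff when no range was configured, and stops at the first key that is not already in use. The userspace sketch below mirrors that loop under stated assumptions: key_in_use() stands in for nf_nat_used_tuple() and its behaviour here is made up.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* stand-in for nf_nat_used_tuple(): pretend keys below 5 are taken */
static bool key_in_use(uint16_t key)
{
	return key < 5;
}

/*
 * Mirror of the gre_unique_tuple() search: start from a hint, wrap
 * around inside [min, min + range_size), give up after one full pass.
 * range_size must be non-zero.
 */
static bool pick_gre_key(uint16_t min, uint32_t range_size,
			 uint16_t hint, uint16_t *out)
{
	uint32_t key = hint;

	for (uint32_t i = 0; i < range_size; i++, key++) {
		uint16_t candidate = min + (key % range_size);

		if (!key_in_use(candidate)) {
			*out = candidate;
			return true;
		}
	}
	return false;	/* every key in the range is taken */
}

int main(void)
{
	uint16_t key;

	/* no range configured: same default span the code above uses */
	if (pick_gre_key(1, 0xffff, 0, &key))
		printf("chose GRE key %u\n", key);
	return 0;
}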
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index eb303471bcf..5744c3ec847 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -8,14 +8,14 @@
8 8
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/export.h>
12#include <linux/ip.h> 11#include <linux/ip.h>
13#include <linux/icmp.h> 12#include <linux/icmp.h>
14 13
15#include <linux/netfilter.h> 14#include <linux/netfilter.h>
16#include <net/netfilter/nf_nat.h> 15#include <net/netfilter/nf_nat.h>
17#include <net/netfilter/nf_nat_core.h> 16#include <net/netfilter/nf_nat_core.h>
18#include <net/netfilter/nf_nat_l4proto.h> 17#include <net/netfilter/nf_nat_rule.h>
18#include <net/netfilter/nf_nat_protocol.h>
19 19
20static bool 20static bool
21icmp_in_range(const struct nf_conntrack_tuple *tuple, 21icmp_in_range(const struct nf_conntrack_tuple *tuple,
@@ -28,8 +28,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
28} 28}
29 29
30static void 30static void
31icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, 31icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
32 struct nf_conntrack_tuple *tuple,
33 const struct nf_nat_range *range, 32 const struct nf_nat_range *range,
34 enum nf_nat_manip_type maniptype, 33 enum nf_nat_manip_type maniptype,
35 const struct nf_conn *ct) 34 const struct nf_conn *ct)
@@ -38,14 +37,13 @@ icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
38 unsigned int range_size; 37 unsigned int range_size;
39 unsigned int i; 38 unsigned int i;
40 39
41 range_size = ntohs(range->max_proto.icmp.id) - 40 range_size = ntohs(range->max.icmp.id) - ntohs(range->min.icmp.id) + 1;
42 ntohs(range->min_proto.icmp.id) + 1;
43 /* If no range specified... */ 41 /* If no range specified... */
44 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) 42 if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
45 range_size = 0xFFFF; 43 range_size = 0xFFFF;
46 44
47 for (i = 0; ; ++id) { 45 for (i = 0; ; ++id) {
48 tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) + 46 tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
49 (id % range_size)); 47 (id % range_size));
50 if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) 48 if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
51 return; 49 return;
@@ -55,12 +53,13 @@ icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
55 53
56static bool 54static bool
57icmp_manip_pkt(struct sk_buff *skb, 55icmp_manip_pkt(struct sk_buff *skb,
58 const struct nf_nat_l3proto *l3proto, 56 unsigned int iphdroff,
59 unsigned int iphdroff, unsigned int hdroff,
60 const struct nf_conntrack_tuple *tuple, 57 const struct nf_conntrack_tuple *tuple,
61 enum nf_nat_manip_type maniptype) 58 enum nf_nat_manip_type maniptype)
62{ 59{
60 const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
63 struct icmphdr *hdr; 61 struct icmphdr *hdr;
62 unsigned int hdroff = iphdroff + iph->ihl*4;
64 63
65 if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) 64 if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
66 return false; 65 return false;
@@ -72,12 +71,14 @@ icmp_manip_pkt(struct sk_buff *skb,
72 return true; 71 return true;
73} 72}
74 73
75const struct nf_nat_l4proto nf_nat_l4proto_icmp = { 74const struct nf_nat_protocol nf_nat_protocol_icmp = {
76 .l4proto = IPPROTO_ICMP, 75 .protonum = IPPROTO_ICMP,
76 .me = THIS_MODULE,
77 .manip_pkt = icmp_manip_pkt, 77 .manip_pkt = icmp_manip_pkt,
78 .in_range = icmp_in_range, 78 .in_range = icmp_in_range,
79 .unique_tuple = icmp_unique_tuple, 79 .unique_tuple = icmp_unique_tuple,
80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 80#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
81 .nlattr_to_range = nf_nat_l4proto_nlattr_to_range, 81 .range_to_nlattr = nf_nat_proto_range_to_nlattr,
82 .nlattr_to_range = nf_nat_proto_nlattr_to_range,
82#endif 83#endif
83}; 84};
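The right-hand (older) gre_manip_pkt()/icmp_manip_pkt() recompute the L4 header offset themselves as iphdroff + iph->ihl * 4, while the newer code receives hdroff from its caller. A small standalone sketch of that offset computation on a raw packet buffer follows; the sample bytes are made up.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * The low nibble of the first IP header byte is the header length in
 * 32-bit words, so the transport header starts ihl * 4 bytes after the
 * start of the IP header.
 */
static size_t transport_offset(const uint8_t *pkt, size_t iphdroff)
{
	unsigned int ihl = pkt[iphdroff] & 0x0f;

	return iphdroff + ihl * 4;
}

int main(void)
{
	/* 0x45 = IPv4, ihl of 5 words (20 bytes); rest of packet omitted */
	uint8_t pkt[64] = { 0x45 };

	printf("ICMP/GRE header starts at offset %zu\n",
	       transport_offset(pkt, 0));
	return 0;
}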
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index bac712293fd..076b7c8c4aa 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -400,12 +400,15 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
400 *len = 0; 400 *len = 0;
401 401
402 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); 402 *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
403 if (*octets == NULL) 403 if (*octets == NULL) {
404 if (net_ratelimit())
405 pr_notice("OOM in bsalg (%d)\n", __LINE__);
404 return 0; 406 return 0;
407 }
405 408
406 ptr = *octets; 409 ptr = *octets;
407 while (ctx->pointer < eoc) { 410 while (ctx->pointer < eoc) {
408 if (!asn1_octet_decode(ctx, ptr++)) { 411 if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
409 kfree(*octets); 412 kfree(*octets);
410 *octets = NULL; 413 *octets = NULL;
411 return 0; 414 return 0;
@@ -448,8 +451,11 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
448 return 0; 451 return 0;
449 452
450 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); 453 *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
451 if (*oid == NULL) 454 if (*oid == NULL) {
455 if (net_ratelimit())
456 pr_notice("OOM in bsalg (%d)\n", __LINE__);
452 return 0; 457 return 0;
458 }
453 459
454 optr = *oid; 460 optr = *oid;
455 461
@@ -722,6 +728,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
722 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); 728 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
723 if (*obj == NULL) { 729 if (*obj == NULL) {
724 kfree(id); 730 kfree(id);
731 if (net_ratelimit())
732 pr_notice("OOM in bsalg (%d)\n", __LINE__);
725 return 0; 733 return 0;
726 } 734 }
727 (*obj)->syntax.l[0] = l; 735 (*obj)->syntax.l[0] = l;
@@ -736,6 +744,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
736 if (*obj == NULL) { 744 if (*obj == NULL) {
737 kfree(p); 745 kfree(p);
738 kfree(id); 746 kfree(id);
747 if (net_ratelimit())
748 pr_notice("OOM in bsalg (%d)\n", __LINE__);
739 return 0; 749 return 0;
740 } 750 }
741 memcpy((*obj)->syntax.c, p, len); 751 memcpy((*obj)->syntax.c, p, len);
@@ -749,6 +759,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
749 *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); 759 *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
750 if (*obj == NULL) { 760 if (*obj == NULL) {
751 kfree(id); 761 kfree(id);
762 if (net_ratelimit())
763 pr_notice("OOM in bsalg (%d)\n", __LINE__);
752 return 0; 764 return 0;
753 } 765 }
754 if (!asn1_null_decode(ctx, end)) { 766 if (!asn1_null_decode(ctx, end)) {
@@ -759,7 +771,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
759 } 771 }
760 break; 772 break;
761 case SNMP_OBJECTID: 773 case SNMP_OBJECTID:
762 if (!asn1_oid_decode(ctx, end, &lp, &len)) { 774 if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
763 kfree(id); 775 kfree(id);
764 return 0; 776 return 0;
765 } 777 }
@@ -768,6 +780,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
768 if (*obj == NULL) { 780 if (*obj == NULL) {
769 kfree(lp); 781 kfree(lp);
770 kfree(id); 782 kfree(id);
783 if (net_ratelimit())
784 pr_notice("OOM in bsalg (%d)\n", __LINE__);
771 return 0; 785 return 0;
772 } 786 }
773 memcpy((*obj)->syntax.ul, lp, len); 787 memcpy((*obj)->syntax.ul, lp, len);
@@ -787,6 +801,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
787 if (*obj == NULL) { 801 if (*obj == NULL) {
788 kfree(p); 802 kfree(p);
789 kfree(id); 803 kfree(id);
804 if (net_ratelimit())
805 pr_notice("OOM in bsalg (%d)\n", __LINE__);
790 return 0; 806 return 0;
791 } 807 }
792 memcpy((*obj)->syntax.uc, p, len); 808 memcpy((*obj)->syntax.uc, p, len);
@@ -803,6 +819,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
803 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); 819 *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
804 if (*obj == NULL) { 820 if (*obj == NULL) {
805 kfree(id); 821 kfree(id);
822 if (net_ratelimit())
823 pr_notice("OOM in bsalg (%d)\n", __LINE__);
806 return 0; 824 return 0;
807 } 825 }
808 (*obj)->syntax.ul[0] = ul; 826 (*obj)->syntax.ul[0] = ul;
@@ -1206,7 +1224,8 @@ static int snmp_translate(struct nf_conn *ct,
1206 1224
1207 if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), 1225 if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
1208 paylen, &map, &udph->check)) { 1226 paylen, &map, &udph->check)) {
1209 net_warn_ratelimited("bsalg: parser failed\n"); 1227 if (net_ratelimit())
1228 printk(KERN_WARNING "bsalg: parser failed\n");
1210 return NF_DROP; 1229 return NF_DROP;
1211 } 1230 }
1212 return NF_ACCEPT; 1231 return NF_ACCEPT;
@@ -1240,8 +1259,9 @@ static int help(struct sk_buff *skb, unsigned int protoff,
1240 * can mess around with the payload. 1259 * can mess around with the payload.
1241 */ 1260 */
1242 if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) { 1261 if (ntohs(udph->len) != skb->len - (iph->ihl << 2)) {
1243 net_warn_ratelimited("SNMP: dropping malformed packet src=%pI4 dst=%pI4\n", 1262 if (net_ratelimit())
1244 &iph->saddr, &iph->daddr); 1263 printk(KERN_WARNING "SNMP: dropping malformed packet src=%pI4 dst=%pI4\n",
1264 &iph->saddr, &iph->daddr);
1245 return NF_DROP; 1265 return NF_DROP;
1246 } 1266 }
1247 1267
@@ -1290,7 +1310,7 @@ static int __init nf_nat_snmp_basic_init(void)
1290 int ret = 0; 1310 int ret = 0;
1291 1311
1292 BUG_ON(nf_nat_snmp_hook != NULL); 1312 BUG_ON(nf_nat_snmp_hook != NULL);
1293 RCU_INIT_POINTER(nf_nat_snmp_hook, help); 1313 rcu_assign_pointer(nf_nat_snmp_hook, help);
1294 1314
1295 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1315 ret = nf_conntrack_helper_register(&snmp_trap_helper);
1296 if (ret < 0) { 1316 if (ret < 0) {
@@ -1302,7 +1322,7 @@ static int __init nf_nat_snmp_basic_init(void)
1302 1322
1303static void __exit nf_nat_snmp_basic_fini(void) 1323static void __exit nf_nat_snmp_basic_fini(void)
1304{ 1324{
1305 RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); 1325 rcu_assign_pointer(nf_nat_snmp_hook, NULL);
1306 nf_conntrack_helper_unregister(&snmp_trap_helper); 1326 nf_conntrack_helper_unregister(&snmp_trap_helper);
1307} 1327}
1308 1328
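The older side restores net_ratelimit()-guarded pr_notice()/printk() messages that the newer code dropped in favour of net_warn_ratelimited() and the allocator's own warnings. The idea in both cases is "log at most N messages per interval". Below is a rough userspace sketch of that throttle; the interval and burst values are illustrative, not necessarily the kernel defaults, and the helper is not thread-safe.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* allow at most `burst` messages per `interval` seconds */
static bool log_ratelimit(void)
{
	static const int interval = 5, burst = 10;
	static time_t window_start;
	static int printed;
	time_t now = time(NULL);

	if (now - window_start >= interval) {
		window_start = now;
		printed = 0;
	}
	if (printed >= burst)
		return false;
	printed++;
	return true;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		if (log_ratelimit())
			fprintf(stderr, "OOM in bsalg (%d)\n", __LINE__);
	return 0;
}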
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8f3d05424a3..39b403f854c 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -20,6 +20,7 @@
20 * 20 *
21 */ 21 */
22 22
23#include <asm/system.h>
23#include <linux/uaccess.h> 24#include <linux/uaccess.h>
24#include <linux/types.h> 25#include <linux/types.h>
25#include <linux/fcntl.h> 26#include <linux/fcntl.h>
@@ -38,7 +39,6 @@
38#include <net/protocol.h> 39#include <net/protocol.h>
39#include <linux/skbuff.h> 40#include <linux/skbuff.h>
40#include <linux/proc_fs.h> 41#include <linux/proc_fs.h>
41#include <linux/export.h>
42#include <net/sock.h> 42#include <net/sock.h>
43#include <net/ping.h> 43#include <net/ping.h>
44#include <net/udp.h> 44#include <net/udp.h>
@@ -51,16 +51,15 @@ static struct ping_table ping_table;
51 51
52static u16 ping_port_rover; 52static u16 ping_port_rover;
53 53
54static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask) 54static inline int ping_hashfn(struct net *net, unsigned num, unsigned mask)
55{ 55{
56 int res = (num + net_hash_mix(net)) & mask; 56 int res = (num + net_hash_mix(net)) & mask;
57
58 pr_debug("hash(%d) = %d\n", num, res); 57 pr_debug("hash(%d) = %d\n", num, res);
59 return res; 58 return res;
60} 59}
61 60
62static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table, 61static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
63 struct net *net, unsigned int num) 62 struct net *net, unsigned num)
64{ 63{
65 return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; 64 return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
66} 65}
@@ -140,14 +139,13 @@ static void ping_v4_unhash(struct sock *sk)
140 write_lock_bh(&ping_table.lock); 139 write_lock_bh(&ping_table.lock);
141 hlist_nulls_del(&sk->sk_nulls_node); 140 hlist_nulls_del(&sk->sk_nulls_node);
142 sock_put(sk); 141 sock_put(sk);
143 isk->inet_num = 0; 142 isk->inet_num = isk->inet_sport = 0;
144 isk->inet_sport = 0;
145 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 143 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
146 write_unlock_bh(&ping_table.lock); 144 write_unlock_bh(&ping_table.lock);
147 } 145 }
148} 146}
149 147
150static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr, 148static struct sock *ping_v4_lookup(struct net *net, u32 saddr, u32 daddr,
151 u16 ident, int dif) 149 u16 ident, int dif)
152{ 150{
153 struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident); 151 struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
@@ -155,15 +153,15 @@ static struct sock *ping_v4_lookup(struct net *net, __be32 saddr, __be32 daddr,
155 struct inet_sock *isk; 153 struct inet_sock *isk;
156 struct hlist_nulls_node *hnode; 154 struct hlist_nulls_node *hnode;
157 155
158 pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", 156 pr_debug("try to find: num = %d, daddr = %ld, dif = %d\n",
159 (int)ident, &daddr, dif); 157 (int)ident, (unsigned long)daddr, dif);
160 read_lock_bh(&ping_table.lock); 158 read_lock_bh(&ping_table.lock);
161 159
162 ping_portaddr_for_each_entry(sk, hnode, hslot) { 160 ping_portaddr_for_each_entry(sk, hnode, hslot) {
163 isk = inet_sk(sk); 161 isk = inet_sk(sk);
164 162
165 pr_debug("found: %p: num = %d, daddr = %pI4, dif = %d\n", sk, 163 pr_debug("found: %p: num = %d, daddr = %ld, dif = %d\n", sk,
166 (int)isk->inet_num, &isk->inet_rcv_saddr, 164 (int)isk->inet_num, (unsigned long)isk->inet_rcv_saddr,
167 sk->sk_bound_dev_if); 165 sk->sk_bound_dev_if);
168 166
169 pr_debug("iterate\n"); 167 pr_debug("iterate\n");
@@ -185,12 +183,11 @@ exit:
185 return sk; 183 return sk;
186} 184}
187 185
188static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, 186static void inet_get_ping_group_range_net(struct net *net, gid_t *low,
189 kgid_t *high) 187 gid_t *high)
190{ 188{
191 kgid_t *data = net->ipv4.sysctl_ping_group_range; 189 gid_t *data = net->ipv4.sysctl_ping_group_range;
192 unsigned int seq; 190 unsigned seq;
193
194 do { 191 do {
195 seq = read_seqbegin(&sysctl_local_ports.lock); 192 seq = read_seqbegin(&sysctl_local_ports.lock);
196 193
@@ -203,20 +200,21 @@ static void inet_get_ping_group_range_net(struct net *net, kgid_t *low,
203static int ping_init_sock(struct sock *sk) 200static int ping_init_sock(struct sock *sk)
204{ 201{
205 struct net *net = sock_net(sk); 202 struct net *net = sock_net(sk);
206 kgid_t group = current_egid(); 203 gid_t group = current_egid();
204 gid_t range[2];
207 struct group_info *group_info = get_current_groups(); 205 struct group_info *group_info = get_current_groups();
208 int i, j, count = group_info->ngroups; 206 int i, j, count = group_info->ngroups;
209 kgid_t low, high;
210 207
211 inet_get_ping_group_range_net(net, &low, &high); 208 inet_get_ping_group_range_net(net, range, range+1);
212 if (gid_lte(low, group) && gid_lte(group, high)) 209 if (range[0] <= group && group <= range[1])
213 return 0; 210 return 0;
214 211
215 for (i = 0; i < group_info->nblocks; i++) { 212 for (i = 0; i < group_info->nblocks; i++) {
216 int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); 213 int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
214
217 for (j = 0; j < cp_count; j++) { 215 for (j = 0; j < cp_count; j++) {
218 kgid_t gid = group_info->blocks[i][j]; 216 group = group_info->blocks[i][j];
219 if (gid_lte(low, gid) && gid_lte(gid, high)) 217 if (range[0] <= group && group <= range[1])
220 return 0; 218 return 0;
221 } 219 }
222 220
@@ -229,7 +227,7 @@ static int ping_init_sock(struct sock *sk)
229static void ping_close(struct sock *sk, long timeout) 227static void ping_close(struct sock *sk, long timeout)
230{ 228{
231 pr_debug("ping_close(sk=%p,sk->num=%u)\n", 229 pr_debug("ping_close(sk=%p,sk->num=%u)\n",
232 inet_sk(sk), inet_sk(sk)->inet_num); 230 inet_sk(sk), inet_sk(sk)->inet_num);
233 pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter); 231 pr_debug("isk->refcnt = %d\n", sk->sk_refcnt.counter);
234 232
235 sk_common_release(sk); 233 sk_common_release(sk);
@@ -252,10 +250,10 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
252 return -EINVAL; 250 return -EINVAL;
253 251
254 pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n", 252 pr_debug("ping_v4_bind(sk=%p,sa_addr=%08x,sa_port=%d)\n",
255 sk, addr->sin_addr.s_addr, ntohs(addr->sin_port)); 253 sk, addr->sin_addr.s_addr, ntohs(addr->sin_port));
256 254
257 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); 255 chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
258 if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) 256 if (addr->sin_addr.s_addr == INADDR_ANY)
259 chk_addr_ret = RTN_LOCAL; 257 chk_addr_ret = RTN_LOCAL;
260 258
261 if ((sysctl_ip_nonlocal_bind == 0 && 259 if ((sysctl_ip_nonlocal_bind == 0 &&
@@ -279,10 +277,10 @@ static int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
279 goto out; 277 goto out;
280 } 278 }
281 279
282 pr_debug("after bind(): num = %d, daddr = %pI4, dif = %d\n", 280 pr_debug("after bind(): num = %d, daddr = %ld, dif = %d\n",
283 (int)isk->inet_num, 281 (int)isk->inet_num,
284 &isk->inet_rcv_saddr, 282 (unsigned long) isk->inet_rcv_saddr,
285 (int)sk->sk_bound_dev_if); 283 (int)sk->sk_bound_dev_if);
286 284
287 err = 0; 285 err = 0;
288 if (isk->inet_rcv_saddr) 286 if (isk->inet_rcv_saddr)
@@ -335,11 +333,12 @@ void ping_err(struct sk_buff *skb, u32 info)
335 return; 333 return;
336 334
337 pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type, 335 pr_debug("ping_err(type=%04x,code=%04x,id=%04x,seq=%04x)\n", type,
338 code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); 336 code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
339 337
340 sk = ping_v4_lookup(net, iph->daddr, iph->saddr, 338 sk = ping_v4_lookup(net, iph->daddr, iph->saddr,
341 ntohs(icmph->un.echo.id), skb->dev->ifindex); 339 ntohs(icmph->un.echo.id), skb->dev->ifindex);
342 if (sk == NULL) { 340 if (sk == NULL) {
341 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
343 pr_debug("no socket, dropping\n"); 342 pr_debug("no socket, dropping\n");
344 return; /* No socket for error */ 343 return; /* No socket for error */
345 } 344 }
@@ -365,7 +364,6 @@ void ping_err(struct sk_buff *skb, u32 info)
365 break; 364 break;
366 case ICMP_DEST_UNREACH: 365 case ICMP_DEST_UNREACH:
367 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ 366 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
368 ipv4_sk_update_pmtu(skb, sk, info);
369 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) { 367 if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
370 err = EMSGSIZE; 368 err = EMSGSIZE;
371 harderr = 1; 369 harderr = 1;
@@ -381,7 +379,6 @@ void ping_err(struct sk_buff *skb, u32 info)
381 break; 379 break;
382 case ICMP_REDIRECT: 380 case ICMP_REDIRECT:
383 /* See ICMP_SOURCE_QUENCH */ 381 /* See ICMP_SOURCE_QUENCH */
384 ipv4_sk_redirect(skb, sk);
385 err = EREMOTEIO; 382 err = EREMOTEIO;
386 break; 383 break;
387 } 384 }
@@ -410,10 +407,10 @@ out:
410struct pingfakehdr { 407struct pingfakehdr {
411 struct icmphdr icmph; 408 struct icmphdr icmph;
412 struct iovec *iov; 409 struct iovec *iov;
413 __wsum wcheck; 410 u32 wcheck;
414}; 411};
415 412
416static int ping_getfrag(void *from, char *to, 413static int ping_getfrag(void *from, char * to,
417 int offset, int fraglen, int odd, struct sk_buff *skb) 414 int offset, int fraglen, int odd, struct sk_buff *skb)
418{ 415{
419 struct pingfakehdr *pfh = (struct pingfakehdr *)from; 416 struct pingfakehdr *pfh = (struct pingfakehdr *)from;
@@ -462,7 +459,7 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
462 struct rtable *rt = NULL; 459 struct rtable *rt = NULL;
463 struct ip_options_data opt_copy; 460 struct ip_options_data opt_copy;
464 int free = 0; 461 int free = 0;
465 __be32 saddr, daddr, faddr; 462 u32 saddr, daddr, faddr;
466 u8 tos; 463 u8 tos;
467 int err; 464 int err;
468 465
@@ -558,8 +555,7 @@ static int ping_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
558 ipc.oif = inet->mc_index; 555 ipc.oif = inet->mc_index;
559 if (!saddr) 556 if (!saddr)
560 saddr = inet->mc_addr; 557 saddr = inet->mc_addr;
561 } else if (!ipc.oif) 558 }
562 ipc.oif = inet->uc_index;
563 559
564 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, 560 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
565 RT_SCOPE_UNIVERSE, sk->sk_protocol, 561 RT_SCOPE_UNIVERSE, sk->sk_protocol,
@@ -633,7 +629,6 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
633 629
634 pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); 630 pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num);
635 631
636 err = -EOPNOTSUPP;
637 if (flags & MSG_OOB) 632 if (flags & MSG_OOB)
638 goto out; 633 goto out;
639 634
@@ -681,8 +676,9 @@ out:
681static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 676static int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
682{ 677{
683 pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", 678 pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
684 inet_sk(sk), inet_sk(sk)->inet_num, skb); 679 inet_sk(sk), inet_sk(sk)->inet_num, skb);
685 if (sock_queue_rcv_skb(sk, skb) < 0) { 680 if (sock_queue_rcv_skb(sk, skb) < 0) {
681 ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_INERRORS);
686 kfree_skb(skb); 682 kfree_skb(skb);
687 pr_debug("ping_queue_rcv_skb -> failed\n"); 683 pr_debug("ping_queue_rcv_skb -> failed\n");
688 return -1; 684 return -1;
@@ -701,13 +697,13 @@ void ping_rcv(struct sk_buff *skb)
701 struct net *net = dev_net(skb->dev); 697 struct net *net = dev_net(skb->dev);
702 struct iphdr *iph = ip_hdr(skb); 698 struct iphdr *iph = ip_hdr(skb);
703 struct icmphdr *icmph = icmp_hdr(skb); 699 struct icmphdr *icmph = icmp_hdr(skb);
704 __be32 saddr = iph->saddr; 700 u32 saddr = iph->saddr;
705 __be32 daddr = iph->daddr; 701 u32 daddr = iph->daddr;
706 702
707 /* We assume the packet has already been checked by icmp_rcv */ 703 /* We assume the packet has already been checked by icmp_rcv */
708 704
709 pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", 705 pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n",
710 skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); 706 skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence));
711 707
712 /* Push ICMP header back */ 708 /* Push ICMP header back */
713 skb_push(skb, skb->data - (u8 *)icmph); 709 skb_push(skb, skb->data - (u8 *)icmph);
@@ -839,9 +835,7 @@ static void ping_format_sock(struct sock *sp, struct seq_file *f,
839 bucket, src, srcp, dest, destp, sp->sk_state, 835 bucket, src, srcp, dest, destp, sp->sk_state,
840 sk_wmem_alloc_get(sp), 836 sk_wmem_alloc_get(sp),
841 sk_rmem_alloc_get(sp), 837 sk_rmem_alloc_get(sp),
842 0, 0L, 0, 838 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
843 from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
844 0, sock_i_ino(sp),
845 atomic_read(&sp->sk_refcnt), sp, 839 atomic_read(&sp->sk_refcnt), sp,
846 atomic_read(&sp->sk_drops), len); 840 atomic_read(&sp->sk_drops), len);
847} 841}
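ping_init_sock() admits an unprivileged ICMP socket only if the caller's effective gid, or one of its supplementary groups, falls inside the ping_group_range sysctl. The sketch below approximates that check in userspace; it takes the allowed range as arguments instead of reading the sysctl, and the example range in main() is arbitrary.

#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

#define MAX_GROUPS 64

/* true if egid or any supplementary gid lies in [low, high] */
static bool group_in_range(gid_t low, gid_t high)
{
	gid_t groups[MAX_GROUPS];
	gid_t egid = getegid();
	int n;

	if (low <= egid && egid <= high)
		return true;

	n = getgroups(MAX_GROUPS, groups);
	for (int i = 0; i < n; i++)
		if (low <= groups[i] && groups[i] <= high)
			return true;
	return false;
}

int main(void)
{
	/* 0..1000 would be a typical "root plus first user" range */
	printf("allowed: %d\n", group_in_range(0, 1000));
	return 0;
}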
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8de53e1ddd5..4bfad5da94f 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -42,7 +42,6 @@
42#include <linux/inetdevice.h> 42#include <linux/inetdevice.h>
43#include <linux/proc_fs.h> 43#include <linux/proc_fs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45#include <linux/export.h>
46#include <net/sock.h> 45#include <net/sock.h>
47#include <net/raw.h> 46#include <net/raw.h>
48 47
@@ -56,17 +55,17 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
56 55
57 local_bh_disable(); 56 local_bh_disable();
58 orphans = percpu_counter_sum_positive(&tcp_orphan_count); 57 orphans = percpu_counter_sum_positive(&tcp_orphan_count);
59 sockets = proto_sockets_allocated_sum_positive(&tcp_prot); 58 sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
60 local_bh_enable(); 59 local_bh_enable();
61 60
62 socket_seq_show(seq); 61 socket_seq_show(seq);
63 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", 62 seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
64 sock_prot_inuse_get(net, &tcp_prot), orphans, 63 sock_prot_inuse_get(net, &tcp_prot), orphans,
65 tcp_death_row.tw_count, sockets, 64 tcp_death_row.tw_count, sockets,
66 proto_memory_allocated(&tcp_prot)); 65 atomic_long_read(&tcp_memory_allocated));
67 seq_printf(seq, "UDP: inuse %d mem %ld\n", 66 seq_printf(seq, "UDP: inuse %d mem %ld\n",
68 sock_prot_inuse_get(net, &udp_prot), 67 sock_prot_inuse_get(net, &udp_prot),
69 proto_memory_allocated(&udp_prot)); 68 atomic_long_read(&udp_memory_allocated));
70 seq_printf(seq, "UDPLITE: inuse %d\n", 69 seq_printf(seq, "UDPLITE: inuse %d\n",
71 sock_prot_inuse_get(net, &udplite_prot)); 70 sock_prot_inuse_get(net, &udplite_prot));
72 seq_printf(seq, "RAW: inuse %d\n", 71 seq_printf(seq, "RAW: inuse %d\n",
@@ -216,6 +215,7 @@ static const struct snmp_mib snmp4_net_list[] = {
216 SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), 215 SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
217 SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), 216 SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
218 SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), 217 SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
218 SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
219 SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), 219 SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
220 SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), 220 SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
221 SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), 221 SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
@@ -232,6 +232,7 @@ static const struct snmp_mib snmp4_net_list[] = {
232 SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), 232 SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
233 SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), 233 SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
234 SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), 234 SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
235 SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
235 SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), 236 SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
236 SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), 237 SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
237 SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), 238 SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
@@ -255,18 +256,6 @@ static const struct snmp_mib snmp4_net_list[] = {
255 SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), 256 SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
256 SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES), 257 SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
257 SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP), 258 SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
258 SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
259 SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
260 SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
261 SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
262 SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
263 SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
264 SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
265 SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
266 SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
267 SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
268 SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
269 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
270 SNMP_MIB_SENTINEL 259 SNMP_MIB_SENTINEL
271}; 260};
272 261
@@ -298,7 +287,7 @@ static void icmpmsg_put(struct seq_file *seq)
298 287
299 count = 0; 288 count = 0;
300 for (i = 0; i < ICMPMSG_MIB_MAX; i++) { 289 for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
301 val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]); 290 val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i);
302 if (val) { 291 if (val) {
303 type[count] = i; 292 type[count] = i;
304 vals[count++] = val; 293 vals[count++] = val;
@@ -317,7 +306,6 @@ static void icmp_put(struct seq_file *seq)
317{ 306{
318 int i; 307 int i;
319 struct net *net = seq->private; 308 struct net *net = seq->private;
320 atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;
321 309
322 seq_puts(seq, "\nIcmp: InMsgs InErrors"); 310 seq_puts(seq, "\nIcmp: InMsgs InErrors");
323 for (i=0; icmpmibmap[i].name != NULL; i++) 311 for (i=0; icmpmibmap[i].name != NULL; i++)
@@ -330,13 +318,15 @@ static void icmp_put(struct seq_file *seq)
330 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); 318 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
331 for (i=0; icmpmibmap[i].name != NULL; i++) 319 for (i=0; icmpmibmap[i].name != NULL; i++)
332 seq_printf(seq, " %lu", 320 seq_printf(seq, " %lu",
333 atomic_long_read(ptr + icmpmibmap[i].index)); 321 snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
322 icmpmibmap[i].index));
334 seq_printf(seq, " %lu %lu", 323 seq_printf(seq, " %lu %lu",
335 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), 324 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
336 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); 325 snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
337 for (i=0; icmpmibmap[i].name != NULL; i++) 326 for (i=0; icmpmibmap[i].name != NULL; i++)
338 seq_printf(seq, " %lu", 327 seq_printf(seq, " %lu",
339 atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); 328 snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
329 icmpmibmap[i].index | 0x100));
340} 330}
341 331
342/* 332/*
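The proc.c hunks above trade a single atomic_long per statistic (new side) for per-CPU counters folded at read time with snmp_fold_field() (old side): increments stay cache-local and uncontended, and only the rare /proc read pays for the summation. A simplified userspace sketch with one counter slot per thread follows; the thread count and names are invented.

#include <pthread.h>
#include <stdio.h>

#define NCPUS 4

/* one counter slot per "cpu"; increments never contend */
static unsigned long mib[NCPUS];

static void *worker(void *arg)
{
	unsigned long slot = (unsigned long)arg;

	for (int i = 0; i < 100000; i++)
		mib[slot]++;		/* only this thread touches mib[slot] */
	return NULL;
}

/* the read side folds all slots together, like snmp_fold_field() */
static unsigned long fold(void)
{
	unsigned long sum = 0;

	for (int i = 0; i < NCPUS; i++)
		sum += mib[i];
	return sum;
}

int main(void)
{
	pthread_t t[NCPUS];

	for (unsigned long i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < NCPUS; i++)
		pthread_join(t[i], NULL);
	printf("total = %lu\n", fold());
	return 0;
}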
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 0f9d09f54bd..9ae5c01cd0b 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,7 +29,6 @@
29#include <net/protocol.h> 29#include <net/protocol.h>
30 30
31const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; 31const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
32const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
33 32
34/* 33/*
35 * Add a protocol handler to the hash tables 34 * Add a protocol handler to the hash tables
@@ -37,17 +36,12 @@ const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
37 36
38int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) 37int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
39{ 38{
40 return !cmpxchg((const struct net_protocol **)&inet_protos[protocol], 39 int hash = protocol & (MAX_INET_PROTOS - 1);
41 NULL, prot) ? 0 : -1;
42}
43EXPORT_SYMBOL(inet_add_protocol);
44 40
45int inet_add_offload(const struct net_offload *prot, unsigned char protocol) 41 return !cmpxchg((const struct net_protocol **)&inet_protos[hash],
46{
47 return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],
48 NULL, prot) ? 0 : -1; 42 NULL, prot) ? 0 : -1;
49} 43}
50EXPORT_SYMBOL(inet_add_offload); 44EXPORT_SYMBOL(inet_add_protocol);
51 45
52/* 46/*
53 * Remove a protocol from the hash tables. 47 * Remove a protocol from the hash tables.
@@ -55,9 +49,9 @@ EXPORT_SYMBOL(inet_add_offload);
55 49
56int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) 50int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
57{ 51{
58 int ret; 52 int ret, hash = protocol & (MAX_INET_PROTOS - 1);
59 53
60 ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol], 54 ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash],
61 prot, NULL) == prot) ? 0 : -1; 55 prot, NULL) == prot) ? 0 : -1;
62 56
63 synchronize_net(); 57 synchronize_net();
@@ -65,16 +59,3 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
65 return ret; 59 return ret;
66} 60}
67EXPORT_SYMBOL(inet_del_protocol); 61EXPORT_SYMBOL(inet_del_protocol);
68
69int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
70{
71 int ret;
72
73 ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol],
74 prot, NULL) == prot) ? 0 : -1;
75
76 synchronize_net();
77
78 return ret;
79}
80EXPORT_SYMBOL(inet_del_offload);
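Both versions of inet_add_protocol() register a handler by publishing it into its table slot only if the slot is still NULL, using cmpxchg so two concurrent registrations cannot clobber each other (the older side additionally masks the protocol number into the smaller table). A userspace sketch of that lock-free register/unregister with C11 compare-exchange follows; the handler structure and protocol numbers are illustrative only.

#include <stdatomic.h>
#include <stdio.h>

#define MAX_PROTOS 256

struct proto_ops { const char *name; };

static _Atomic(const struct proto_ops *) protos[MAX_PROTOS];

/* 0 on success, -1 if some other handler already owns the slot */
static int add_protocol(const struct proto_ops *ops, unsigned char num)
{
	const struct proto_ops *expected = NULL;

	return atomic_compare_exchange_strong(&protos[num], &expected, ops)
		? 0 : -1;
}

/* 0 on success, -1 if we were not the registered handler */
static int del_protocol(const struct proto_ops *ops, unsigned char num)
{
	const struct proto_ops *expected = ops;
	int ret;

	ret = atomic_compare_exchange_strong(&protos[num], &expected, NULL)
		? 0 : -1;
	/* the kernel also waits (synchronize_net()) before freeing ops */
	return ret;
}

int main(void)
{
	static const struct proto_ops gre = { "gre" };

	printf("first add:  %d\n", add_protocol(&gre, 47));
	printf("second add: %d\n", add_protocol(&gre, 47));	/* -1 */
	printf("del:        %d\n", del_protocol(&gre, 47));
	return 0;
}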
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 73d1e4df4bf..61714bd5292 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -48,7 +48,6 @@
48#include <linux/errno.h> 48#include <linux/errno.h>
49#include <linux/aio.h> 49#include <linux/aio.h>
50#include <linux/kernel.h> 50#include <linux/kernel.h>
51#include <linux/export.h>
52#include <linux/spinlock.h> 51#include <linux/spinlock.h>
53#include <linux/sockios.h> 52#include <linux/sockios.h>
54#include <linux/socket.h> 53#include <linux/socket.h>
@@ -131,20 +130,18 @@ found:
131 * 0 - deliver 130 * 0 - deliver
132 * 1 - block 131 * 1 - block
133 */ 132 */
134static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) 133static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
135{ 134{
136 struct icmphdr _hdr; 135 int type;
137 const struct icmphdr *hdr;
138 136
139 hdr = skb_header_pointer(skb, skb_transport_offset(skb), 137 if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
140 sizeof(_hdr), &_hdr);
141 if (!hdr)
142 return 1; 138 return 1;
143 139
144 if (hdr->type < 32) { 140 type = icmp_hdr(skb)->type;
141 if (type < 32) {
145 __u32 data = raw_sk(sk)->filter.data; 142 __u32 data = raw_sk(sk)->filter.data;
146 143
147 return ((1U << hdr->type) & data) != 0; 144 return ((1 << type) & data) != 0;
148 } 145 }
149 146
150 /* Do not block unknown ICMP types */ 147 /* Do not block unknown ICMP types */
@@ -218,11 +215,6 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
218 int err = 0; 215 int err = 0;
219 int harderr = 0; 216 int harderr = 0;
220 217
221 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
222 ipv4_sk_update_pmtu(skb, sk, info);
223 else if (type == ICMP_REDIRECT)
224 ipv4_sk_redirect(skb, sk);
225
226 /* Report error on raw socket, if: 218 /* Report error on raw socket, if:
227 1. User requested ip_recverr. 219 1. User requested ip_recverr.
228 2. Socket is connected (otherwise the error indication 220 2. Socket is connected (otherwise the error indication
@@ -295,12 +287,11 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
295 read_unlock(&raw_v4_hashinfo.lock); 287 read_unlock(&raw_v4_hashinfo.lock);
296} 288}
297 289
298static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) 290static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
299{ 291{
300 /* Charge it to the socket. */ 292 /* Charge it to the socket. */
301 293
302 ipv4_pktinfo_prepare(skb); 294 if (ip_queue_rcv_skb(sk, skb) < 0) {
303 if (sock_queue_rcv_skb(sk, skb) < 0) {
304 kfree_skb(skb); 295 kfree_skb(skb);
305 return NET_RX_DROP; 296 return NET_RX_DROP;
306 } 297 }
@@ -335,7 +326,6 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
335 unsigned int iphlen; 326 unsigned int iphlen;
336 int err; 327 int err;
337 struct rtable *rt = *rtp; 328 struct rtable *rt = *rtp;
338 int hlen, tlen;
339 329
340 if (length > rt->dst.dev->mtu) { 330 if (length > rt->dst.dev->mtu) {
341 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, 331 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -345,14 +335,12 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
345 if (flags&MSG_PROBE) 335 if (flags&MSG_PROBE)
346 goto out; 336 goto out;
347 337
348 hlen = LL_RESERVED_SPACE(rt->dst.dev);
349 tlen = rt->dst.dev->needed_tailroom;
350 skb = sock_alloc_send_skb(sk, 338 skb = sock_alloc_send_skb(sk,
351 length + hlen + tlen + 15, 339 length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
352 flags & MSG_DONTWAIT, &err); 340 flags & MSG_DONTWAIT, &err);
353 if (skb == NULL) 341 if (skb == NULL)
354 goto error; 342 goto error;
355 skb_reserve(skb, hlen); 343 skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
356 344
357 skb->priority = sk->sk_priority; 345 skb->priority = sk->sk_priority;
358 skb->mark = sk->sk_mark; 346 skb->mark = sk->sk_mark;
@@ -498,8 +486,11 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
498 if (msg->msg_namelen < sizeof(*usin)) 486 if (msg->msg_namelen < sizeof(*usin))
499 goto out; 487 goto out;
500 if (usin->sin_family != AF_INET) { 488 if (usin->sin_family != AF_INET) {
501 pr_info_once("%s: %s forgot to set AF_INET. Fix it!\n", 489 static int complained;
502 __func__, current->comm); 490 if (!complained++)
491 printk(KERN_INFO "%s forgot to set AF_INET in "
492 "raw sendmsg. Fix it!\n",
493 current->comm);
503 err = -EAFNOSUPPORT; 494 err = -EAFNOSUPPORT;
504 if (usin->sin_family) 495 if (usin->sin_family)
505 goto out; 496 goto out;
@@ -567,8 +558,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
567 ipc.oif = inet->mc_index; 558 ipc.oif = inet->mc_index;
568 if (!saddr) 559 if (!saddr)
569 saddr = inet->mc_addr; 560 saddr = inet->mc_addr;
570 } else if (!ipc.oif) 561 }
571 ipc.oif = inet->uc_index;
572 562
573 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, 563 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
574 RT_SCOPE_UNIVERSE, 564 RT_SCOPE_UNIVERSE,
@@ -994,9 +984,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
994 i, src, srcp, dest, destp, sp->sk_state, 984 i, src, srcp, dest, destp, sp->sk_state,
995 sk_wmem_alloc_get(sp), 985 sk_wmem_alloc_get(sp),
996 sk_rmem_alloc_get(sp), 986 sk_rmem_alloc_get(sp),
997 0, 0L, 0, 987 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
998 from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
999 0, sock_i_ino(sp),
1000 atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); 988 atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
1001} 989}
1002 990
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 844a9ef60db..b5638545deb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -62,14 +62,14 @@
62 * 2 of the License, or (at your option) any later version. 62 * 2 of the License, or (at your option) any later version.
63 */ 63 */
64 64
65#define pr_fmt(fmt) "IPv4: " fmt
66
67#include <linux/module.h> 65#include <linux/module.h>
68#include <asm/uaccess.h> 66#include <asm/uaccess.h>
67#include <asm/system.h>
69#include <linux/bitops.h> 68#include <linux/bitops.h>
70#include <linux/types.h> 69#include <linux/types.h>
71#include <linux/kernel.h> 70#include <linux/kernel.h>
72#include <linux/mm.h> 71#include <linux/mm.h>
72#include <linux/bootmem.h>
73#include <linux/string.h> 73#include <linux/string.h>
74#include <linux/socket.h> 74#include <linux/socket.h>
75#include <linux/sockios.h> 75#include <linux/sockios.h>
@@ -79,6 +79,7 @@
79#include <linux/netdevice.h> 79#include <linux/netdevice.h>
80#include <linux/proc_fs.h> 80#include <linux/proc_fs.h>
81#include <linux/init.h> 81#include <linux/init.h>
82#include <linux/workqueue.h>
82#include <linux/skbuff.h> 83#include <linux/skbuff.h>
83#include <linux/inetdevice.h> 84#include <linux/inetdevice.h>
84#include <linux/igmp.h> 85#include <linux/igmp.h>
@@ -86,9 +87,11 @@
86#include <linux/mroute.h> 87#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h> 88#include <linux/netfilter_ipv4.h>
88#include <linux/random.h> 89#include <linux/random.h>
90#include <linux/jhash.h>
89#include <linux/rcupdate.h> 91#include <linux/rcupdate.h>
90#include <linux/times.h> 92#include <linux/times.h>
91#include <linux/slab.h> 93#include <linux/slab.h>
94#include <linux/prefetch.h>
92#include <net/dst.h> 95#include <net/dst.h>
93#include <net/net_namespace.h> 96#include <net/net_namespace.h>
94#include <net/protocol.h> 97#include <net/protocol.h>
@@ -105,8 +108,8 @@
105#include <net/rtnetlink.h> 108#include <net/rtnetlink.h>
106#ifdef CONFIG_SYSCTL 109#ifdef CONFIG_SYSCTL
107#include <linux/sysctl.h> 110#include <linux/sysctl.h>
108#include <linux/kmemleak.h>
109#endif 111#endif
112#include <net/atmclip.h>
110#include <net/secure_seq.h> 113#include <net/secure_seq.h>
111 114
112#define RT_FL_TOS(oldflp4) \ 115#define RT_FL_TOS(oldflp4) \
@@ -118,7 +121,7 @@
118 121
119static int ip_rt_max_size; 122static int ip_rt_max_size;
120static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; 123static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval __read_mostly = 60 * HZ; 124static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122static int ip_rt_gc_min_interval __read_mostly = HZ / 2; 125static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123static int ip_rt_redirect_number __read_mostly = 9; 126static int ip_rt_redirect_number __read_mostly = 9;
124static int ip_rt_redirect_load __read_mostly = HZ / 50; 127static int ip_rt_redirect_load __read_mostly = HZ / 50;
@@ -129,6 +132,11 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; 132static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; 133static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131static int ip_rt_min_advmss __read_mostly = 256; 134static int ip_rt_min_advmss __read_mostly = 256;
135static int rt_chain_length_max __read_mostly = 20;
136static int redirect_genid;
137
138static struct delayed_work expires_work;
139static unsigned long expires_ljiffies;
132 140
133/* 141/*
134 * Interface to generic destination cache. 142 * Interface to generic destination cache.
@@ -136,14 +144,12 @@ static int ip_rt_min_advmss __read_mostly = 256;
136 144
137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138static unsigned int ipv4_default_advmss(const struct dst_entry *dst); 146static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139static unsigned int ipv4_mtu(const struct dst_entry *dst); 147static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
148static void ipv4_dst_destroy(struct dst_entry *dst);
140static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141static void ipv4_link_failure(struct sk_buff *skb); 150static void ipv4_link_failure(struct sk_buff *skb);
142static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
143 struct sk_buff *skb, u32 mtu); 152static int rt_garbage_collect(struct dst_ops *ops);
144static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
146static void ipv4_dst_destroy(struct dst_entry *dst);
147 153
148static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 154static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149 int how) 155 int how)
@@ -152,27 +158,54 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152 158
153static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) 159static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154{ 160{
155 WARN_ON(1); 161 struct rtable *rt = (struct rtable *) dst;
156 return NULL; 162 struct inet_peer *peer;
163 u32 *p = NULL;
164
165 if (!rt->peer)
166 rt_bind_peer(rt, rt->rt_dst, 1);
167
168 peer = rt->peer;
169 if (peer) {
170 u32 *old_p = __DST_METRICS_PTR(old);
171 unsigned long prev, new;
172
173 p = peer->metrics;
174 if (inet_metrics_new(peer))
175 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176
177 new = (unsigned long) p;
178 prev = cmpxchg(&dst->_metrics, old, new);
179
180 if (prev != old) {
181 p = __DST_METRICS_PTR(prev);
182 if (prev & DST_METRICS_READ_ONLY)
183 p = NULL;
184 } else {
185 if (rt->fi) {
186 fib_info_put(rt->fi);
187 rt->fi = NULL;
188 }
189 }
190 }
191 return p;
157} 192}
158 193
159static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, 194static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
160 struct sk_buff *skb,
161 const void *daddr);
162 195
163static struct dst_ops ipv4_dst_ops = { 196static struct dst_ops ipv4_dst_ops = {
164 .family = AF_INET, 197 .family = AF_INET,
165 .protocol = cpu_to_be16(ETH_P_IP), 198 .protocol = cpu_to_be16(ETH_P_IP),
199 .gc = rt_garbage_collect,
166 .check = ipv4_dst_check, 200 .check = ipv4_dst_check,
167 .default_advmss = ipv4_default_advmss, 201 .default_advmss = ipv4_default_advmss,
168 .mtu = ipv4_mtu, 202 .default_mtu = ipv4_default_mtu,
169 .cow_metrics = ipv4_cow_metrics, 203 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy, 204 .destroy = ipv4_dst_destroy,
171 .ifdown = ipv4_dst_ifdown, 205 .ifdown = ipv4_dst_ifdown,
172 .negative_advice = ipv4_negative_advice, 206 .negative_advice = ipv4_negative_advice,
173 .link_failure = ipv4_link_failure, 207 .link_failure = ipv4_link_failure,
174 .update_pmtu = ip_rt_update_pmtu, 208 .update_pmtu = ip_rt_update_pmtu,
175 .redirect = ip_do_redirect,
176 .local_out = __ip_local_out, 209 .local_out = __ip_local_out,
177 .neigh_lookup = ipv4_neigh_lookup, 210 .neigh_lookup = ipv4_neigh_lookup,
178}; 211};
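The restored ipv4_cow_metrics() above copies the read-only metrics into per-peer storage and then swings dst->_metrics over with cmpxchg; if another CPU won the race, it simply adopts the winner's pointer. Below is a simplified userspace sketch of that copy-then-compare-exchange pattern; the array layout and names are invented, and the winning copy is deliberately leaked at exit for brevity.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NMETRICS 16

static const unsigned int default_metrics[NMETRICS];	/* shared, read-only */

static _Atomic(const unsigned int *) metrics = default_metrics;

/* return a writable metrics array, installing a private copy on demand */
static unsigned int *cow_metrics(void)
{
	const unsigned int *old = atomic_load(&metrics);
	unsigned int *copy;

	if (old != default_metrics)
		return (unsigned int *)old;	/* already copied earlier */

	copy = malloc(sizeof(default_metrics));
	if (!copy)
		return NULL;
	memcpy(copy, old, sizeof(default_metrics));

	if (atomic_compare_exchange_strong(&metrics, &old, copy))
		return copy;

	/* another caller won the race: drop our copy, use theirs */
	free(copy);
	return (unsigned int *)old;
}

int main(void)
{
	unsigned int *m = cow_metrics();

	if (m) {
		m[0] = 1500;	/* e.g. a per-destination MTU override */
		printf("metric[0] = %u\n", atomic_load(&metrics)[0]);
	}
	return 0;
}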
@@ -197,27 +230,186 @@ const __u8 ip_tos2prio[16] = {
197 TC_PRIO_INTERACTIVE_BULK, 230 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK) 231 ECN_OR_COST(INTERACTIVE_BULK)
199}; 232};
200EXPORT_SYMBOL(ip_tos2prio); 233
234
235/*
236 * Route cache.
237 */
238
239/* The locking scheme is rather straight forward:
240 *
241 * 1) Read-Copy Update protects the buckets of the central route hash.
242 * 2) Only writers remove entries, and they hold the lock
243 * as they look at rtable reference counts.
244 * 3) Only readers acquire references to rtable entries,
245 * they do so with atomic increments and with the
246 * lock held.
247 */
248
249struct rt_hash_bucket {
250 struct rtable __rcu *chain;
251};
252
253#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
254 defined(CONFIG_PROVE_LOCKING)
255/*
256 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
257 * The size of this table is a power of two and depends on the number of CPUS.
258 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
259 */
260#ifdef CONFIG_LOCKDEP
261# define RT_HASH_LOCK_SZ 256
262#else
263# if NR_CPUS >= 32
264# define RT_HASH_LOCK_SZ 4096
265# elif NR_CPUS >= 16
266# define RT_HASH_LOCK_SZ 2048
267# elif NR_CPUS >= 8
268# define RT_HASH_LOCK_SZ 1024
269# elif NR_CPUS >= 4
270# define RT_HASH_LOCK_SZ 512
271# else
272# define RT_HASH_LOCK_SZ 256
273# endif
274#endif
275
276static spinlock_t *rt_hash_locks;
277# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
278
279static __init void rt_hash_lock_init(void)
280{
281 int i;
282
283 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
284 GFP_KERNEL);
285 if (!rt_hash_locks)
286 panic("IP: failed to allocate rt_hash_locks\n");
287
288 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
289 spin_lock_init(&rt_hash_locks[i]);
290}
291#else
292# define rt_hash_lock_addr(slot) NULL
293
294static inline void rt_hash_lock_init(void)
295{
296}
297#endif
298
299static struct rt_hash_bucket *rt_hash_table __read_mostly;
300static unsigned rt_hash_mask __read_mostly;
301static unsigned int rt_hash_log __read_mostly;
201 302
202static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 303static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) 304#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204 305
306static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
307 int genid)
308{
309 return jhash_3words((__force u32)daddr, (__force u32)saddr,
310 idx, genid)
311 & rt_hash_mask;
312}
313
314static inline int rt_genid(struct net *net)
315{
316 return atomic_read(&net->ipv4.rt_genid);
317}
318
205#ifdef CONFIG_PROC_FS 319#ifdef CONFIG_PROC_FS
320struct rt_cache_iter_state {
321 struct seq_net_private p;
322 int bucket;
323 int genid;
324};
325
326static struct rtable *rt_cache_get_first(struct seq_file *seq)
327{
328 struct rt_cache_iter_state *st = seq->private;
329 struct rtable *r = NULL;
330
331 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
332 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
333 continue;
334 rcu_read_lock_bh();
335 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
336 while (r) {
337 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
338 r->rt_genid == st->genid)
339 return r;
340 r = rcu_dereference_bh(r->dst.rt_next);
341 }
342 rcu_read_unlock_bh();
343 }
344 return r;
345}
346
347static struct rtable *__rt_cache_get_next(struct seq_file *seq,
348 struct rtable *r)
349{
350 struct rt_cache_iter_state *st = seq->private;
351
352 r = rcu_dereference_bh(r->dst.rt_next);
353 while (!r) {
354 rcu_read_unlock_bh();
355 do {
356 if (--st->bucket < 0)
357 return NULL;
358 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
359 rcu_read_lock_bh();
360 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
361 }
362 return r;
363}
364
365static struct rtable *rt_cache_get_next(struct seq_file *seq,
366 struct rtable *r)
367{
368 struct rt_cache_iter_state *st = seq->private;
369 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
370 if (dev_net(r->dst.dev) != seq_file_net(seq))
371 continue;
372 if (r->rt_genid == st->genid)
373 break;
374 }
375 return r;
376}
377
378static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
379{
380 struct rtable *r = rt_cache_get_first(seq);
381
382 if (r)
383 while (pos && (r = rt_cache_get_next(seq, r)))
384 --pos;
385 return pos ? NULL : r;
386}
387
206static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 388static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207{ 389{
390 struct rt_cache_iter_state *st = seq->private;
208 if (*pos) 391 if (*pos)
209 return NULL; 392 return rt_cache_get_idx(seq, *pos - 1);
393 st->genid = rt_genid(seq_file_net(seq));
210 return SEQ_START_TOKEN; 394 return SEQ_START_TOKEN;
211} 395}
212 396
213static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) 397static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214{ 398{
399 struct rtable *r;
400
401 if (v == SEQ_START_TOKEN)
402 r = rt_cache_get_first(seq);
403 else
404 r = rt_cache_get_next(seq, v);
215 ++*pos; 405 ++*pos;
216 return NULL; 406 return r;
217} 407}
218 408
219static void rt_cache_seq_stop(struct seq_file *seq, void *v) 409static void rt_cache_seq_stop(struct seq_file *seq, void *v)
220{ 410{
411 if (v && v != SEQ_START_TOKEN)
412 rcu_read_unlock_bh();
221} 413}
222 414
223static int rt_cache_seq_show(struct seq_file *seq, void *v) 415static int rt_cache_seq_show(struct seq_file *seq, void *v)
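The restored route cache in the hunk above hashes each flow into a bucket (jhash_3words over daddr, saddr and ifindex, masked by rt_hash_mask) and, instead of one spinlock per bucket, guards buckets with a small power-of-two pool of locks indexed by slot & (RT_HASH_LOCK_SZ - 1). A compact pthread sketch of that lock striping follows; the table sizes and helper names are illustrative.

#include <pthread.h>
#include <stdio.h>

#define NBUCKETS 1024
#define NLOCKS     64		/* power of two, far fewer than buckets */

static int buckets[NBUCKETS];
static pthread_mutex_t locks[NLOCKS];

/* same idea as rt_hash_lock_addr(): many buckets share one lock */
static pthread_mutex_t *bucket_lock(unsigned int slot)
{
	return &locks[slot & (NLOCKS - 1)];
}

static void bucket_inc(unsigned int slot)
{
	pthread_mutex_t *l = bucket_lock(slot);

	pthread_mutex_lock(l);
	buckets[slot]++;
	pthread_mutex_unlock(l);
}

int main(void)
{
	for (int i = 0; i < NLOCKS; i++)
		pthread_mutex_init(&locks[i], NULL);

	bucket_inc(3);
	bucket_inc(3 + NLOCKS);	/* different bucket, same lock */
	printf("bucket 3 = %d, bucket %d = %d\n",
	       buckets[3], 3 + NLOCKS, buckets[3 + NLOCKS]);
	return 0;
}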
@@ -227,6 +419,34 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" 419 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" 420 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst"); 421 "HHUptod\tSpecDst");
422 else {
423 struct rtable *r = v;
424 struct neighbour *n;
425 int len, HHUptod;
426
427 rcu_read_lock();
428 n = dst_get_neighbour(&r->dst);
429 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430 rcu_read_unlock();
431
432 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
433 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
434 r->dst.dev ? r->dst.dev->name : "*",
435 (__force u32)r->rt_dst,
436 (__force u32)r->rt_gateway,
437 r->rt_flags, atomic_read(&r->dst.__refcnt),
438 r->dst.__use, 0, (__force u32)r->rt_src,
439 dst_metric_advmss(&r->dst) + 40,
440 dst_metric(&r->dst, RTAX_WINDOW),
441 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
442 dst_metric(&r->dst, RTAX_RTTVAR)),
443 r->rt_key_tos,
444 -1,
445 HHUptod,
446 r->rt_spec_dst, &len);
447
448 seq_printf(seq, "%*s\n", 127 - len, "");
449 }
230 return 0; 450 return 0;
231} 451}
232 452
@@ -239,7 +459,8 @@ static const struct seq_operations rt_cache_seq_ops = {
239 459
240static int rt_cache_seq_open(struct inode *inode, struct file *file) 460static int rt_cache_seq_open(struct inode *inode, struct file *file)
241{ 461{
242 return seq_open(file, &rt_cache_seq_ops); 462 return seq_open_net(inode, file, &rt_cache_seq_ops,
463 sizeof(struct rt_cache_iter_state));
243} 464}
244 465
245static const struct file_operations rt_cache_seq_fops = { 466static const struct file_operations rt_cache_seq_fops = {
@@ -247,7 +468,7 @@ static const struct file_operations rt_cache_seq_fops = {
247 .open = rt_cache_seq_open, 468 .open = rt_cache_seq_open,
248 .read = seq_read, 469 .read = seq_read,
249 .llseek = seq_lseek, 470 .llseek = seq_lseek,
250 .release = seq_release, 471 .release = seq_release_net,
251}; 472};
252 473
253 474
@@ -437,252 +658,791 @@ static inline int ip_rt_proc_init(void)
437} 658}
438#endif /* CONFIG_PROC_FS */ 659#endif /* CONFIG_PROC_FS */
439 660
440static inline bool rt_is_expired(const struct rtable *rth) 661static inline void rt_free(struct rtable *rt)
441{ 662{
442 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); 663 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
443} 664}
444 665
445void rt_cache_flush(struct net *net) 666static inline void rt_drop(struct rtable *rt)
446{ 667{
447 rt_genid_bump(net); 668 ip_rt_put(rt);
669 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
448} 670}
449 671
450static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, 672static inline int rt_fast_clean(struct rtable *rth)
451 struct sk_buff *skb,
452 const void *daddr)
453{ 673{
454 struct net_device *dev = dst->dev; 674 /* Kill broadcast/multicast entries very aggresively, if they
455 const __be32 *pkey = daddr; 675 collide in hash table with more useful entries */
456 const struct rtable *rt; 676 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
457 struct neighbour *n; 677 rt_is_input_route(rth) && rth->dst.rt_next;
678}
458 679
459 rt = (const struct rtable *) dst; 680static inline int rt_valuable(struct rtable *rth)
460 if (rt->rt_gateway) 681{
461 pkey = (const __be32 *) &rt->rt_gateway; 682 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
462 else if (skb) 683 (rth->peer && rth->peer->pmtu_expires);
463 pkey = &ip_hdr(skb)->daddr; 684}
464 685
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey); 686static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
466 if (n) 687{
467 return n; 688 unsigned long age;
468 return neigh_create(&arp_tbl, pkey, dev); 689 int ret = 0;
690
691 if (atomic_read(&rth->dst.__refcnt))
692 goto out;
693
694 age = jiffies - rth->dst.lastuse;
695 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
696 (age <= tmo2 && rt_valuable(rth)))
697 goto out;
698 ret = 1;
699out: return ret;
469} 700}
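rt_may_expire() above grants a short timeout to entries that are cheap to rebuild (rt_fast_clean) and a long one to entries worth keeping (rt_valuable); anything referenced never expires. A minimal userspace sketch of the same decision, with invented field names standing in for the rtable members and tmo_fast/tmo_slow playing the roles of tmo1/tmo2:

#include <stdbool.h>
#include <stdio.h>

struct cache_entry {
	long refcnt;         /* like dst.__refcnt */
	unsigned long age;   /* jiffies - lastuse, precomputed for the sketch */
	bool cheap;          /* like rt_fast_clean(): colliding broadcast/multicast input routes */
	bool valuable;       /* like rt_valuable(): redirected/notify or learned PMTU */
};

/* Return true when the entry may be reclaimed, mirroring rt_may_expire(). */
static bool may_expire(const struct cache_entry *e,
		       unsigned long tmo_fast, unsigned long tmo_slow)
{
	if (e->refcnt)                         /* still referenced: never expire */
		return false;
	if (e->age <= tmo_fast && !e->cheap)   /* young and not cheap to rebuild */
		return false;
	if (e->age <= tmo_slow && e->valuable) /* valuable entries get the long timeout */
		return false;
	return true;
}

int main(void)
{
	struct cache_entry e = { .refcnt = 0, .age = 30, .cheap = true, .valuable = false };

	/* A cheap broadcast-style entry is reclaimable even though it is young. */
	printf("expire: %d\n", may_expire(&e, 60, 300));
	return 0;
}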
470 701
471/* 702/* Bits of score are:
472 * Peer allocation may fail only in serious out-of-memory conditions. However 703 * 31: very valuable
473 * we still can generate some output. 704 * 30: not quite useless
474 * Random ID selection looks a bit dangerous because we have no chances to 705 * 29..0: usage counter
475 * select ID being unique in a reasonable period of time.
476 * But broken packet identifier may be better than no packet at all.
477 */ 706 */
478static void ip_select_fb_ident(struct iphdr *iph) 707static inline u32 rt_score(struct rtable *rt)
479{ 708{
480 static DEFINE_SPINLOCK(ip_fb_id_lock); 709 u32 score = jiffies - rt->dst.lastuse;
481 static u32 ip_fallback_id;
482 u32 salt;
483 710
484 spin_lock_bh(&ip_fb_id_lock); 711 score = ~score & ~(3<<30);
485 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); 712
486 iph->id = htons(salt & 0xFFFF); 713 if (rt_valuable(rt))
487 ip_fallback_id = salt; 714 score |= (1<<31);
488 spin_unlock_bh(&ip_fb_id_lock); 715
716 if (rt_is_output_route(rt) ||
717 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
718 score |= (1<<30);
719
720 return score;
489} 721}
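rt_score() packs an inverted age into the low 30 bits and reserves the top two bits for "very valuable" and "not quite useless", so a plain unsigned comparison prefers keeping important entries; rt_intern_hash() later evicts the candidate with the lowest score. A standalone sketch of that packing (parameter names are invented for illustration):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Pack an eviction score the way rt_score() does: bit 31 = very valuable,
 * bit 30 = not quite useless, bits 29..0 = inverted age (older => smaller). */
static uint32_t score(uint32_t age, bool valuable, bool output_or_unicast)
{
	uint32_t s = ~age & ~(3u << 30);   /* keep only the low 30 bits of the inverted age */

	if (valuable)
		s |= 1u << 31;
	if (output_or_unicast)
		s |= 1u << 30;
	return s;
}

int main(void)
{
	/* An old but valuable entry still outranks a fresh, unimportant one. */
	printf("%u > %u\n", score(10000, true, false), score(10, false, false));
	return 0;
}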
490 722
491void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 723static inline bool rt_caching(const struct net *net)
492{ 724{
493 struct net *net = dev_net(dst->dev); 725 return net->ipv4.current_rt_cache_rebuild_count <=
494 struct inet_peer *peer; 726 net->ipv4.sysctl_rt_cache_rebuild_count;
727}
495 728
496 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); 729static inline bool compare_hash_inputs(const struct rtable *rt1,
497 if (peer) { 730 const struct rtable *rt2)
498 iph->id = htons(inet_getid(peer, more)); 731{
499 inet_putpeer(peer); 732 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
500 return; 733 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
501 } 734 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
735}
502 736
503 ip_select_fb_ident(iph); 737static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
738{
739 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
740 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
741 (rt1->rt_mark ^ rt2->rt_mark) |
742 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
743 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
744 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
504} 745}
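compare_hash_inputs() and compare_keys() above OR together the XOR of each field pair and test the accumulated bits against zero, so the whole key comparison compiles to straight-line code with a single branch. A small userspace sketch of the same idiom (the struct is invented for illustration):

#include <stdint.h>
#include <stdio.h>

struct flow_key {
	uint32_t dst, src, mark, tos, iif, oif;
};

/* Branchless equality test in the style of compare_keys(): XOR each pair of
 * fields, OR the differences together, and compare the result to zero. */
static int keys_equal(const struct flow_key *a, const struct flow_key *b)
{
	return ((a->dst ^ b->dst) |
		(a->src ^ b->src) |
		(a->mark ^ b->mark) |
		(a->tos ^ b->tos) |
		(a->iif ^ b->iif) |
		(a->oif ^ b->oif)) == 0;
}

int main(void)
{
	struct flow_key a = { 0x0a000001, 0x0a000002, 0, 0, 2, 0 };
	struct flow_key b = a;

	printf("equal: %d\n", keys_equal(&a, &b));  /* 1 */
	b.tos = 0x10;
	printf("equal: %d\n", keys_equal(&a, &b));  /* 0 */
	return 0;
}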
505EXPORT_SYMBOL(__ip_select_ident);
506 746
507static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk, 747static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
508 const struct iphdr *iph, 748{
509 int oif, u8 tos, 749 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
510 u8 prot, u32 mark, int flow_flags) 750}
751
752static inline int rt_is_expired(struct rtable *rth)
753{
754 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
755}
756
757/*
758 * Perform a full scan of hash table and free all entries.
759 * Can be called by a softirq or a process.
760 * In the latter case, we want to be rescheduled if necessary
761 */
762static void rt_do_flush(struct net *net, int process_context)
763{
764 unsigned int i;
765 struct rtable *rth, *next;
766
767 for (i = 0; i <= rt_hash_mask; i++) {
768 struct rtable __rcu **pprev;
769 struct rtable *list;
770
771 if (process_context && need_resched())
772 cond_resched();
773 rth = rcu_dereference_raw(rt_hash_table[i].chain);
774 if (!rth)
775 continue;
776
777 spin_lock_bh(rt_hash_lock_addr(i));
778
779 list = NULL;
780 pprev = &rt_hash_table[i].chain;
781 rth = rcu_dereference_protected(*pprev,
782 lockdep_is_held(rt_hash_lock_addr(i)));
783
784 while (rth) {
785 next = rcu_dereference_protected(rth->dst.rt_next,
786 lockdep_is_held(rt_hash_lock_addr(i)));
787
788 if (!net ||
789 net_eq(dev_net(rth->dst.dev), net)) {
790 rcu_assign_pointer(*pprev, next);
791 rcu_assign_pointer(rth->dst.rt_next, list);
792 list = rth;
793 } else {
794 pprev = &rth->dst.rt_next;
795 }
796 rth = next;
797 }
798
799 spin_unlock_bh(rt_hash_lock_addr(i));
800
801 for (; list; list = next) {
802 next = rcu_dereference_protected(list->dst.rt_next, 1);
803 rt_free(list);
804 }
805 }
806}
807
808/*
809 * While freeing expired entries, we compute average chain length
810 * and standard deviation, using fixed-point arithmetic.
811 * This gives an estimate of rt_chain_length_max
812 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
813 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
814 */
815
816#define FRACT_BITS 3
817#define ONE (1UL << FRACT_BITS)
818
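rt_check_expire() below accumulates per-bucket chain lengths, then derives the cap as max(elasticity, AVG + 4*SD) in FRACT_BITS fixed point. A standalone userspace sketch of that arithmetic, with a naive integer square root standing in for the kernel's int_sqrt() and made-up sample values:

#include <stdio.h>

#define FRACT_BITS 3

static unsigned long isqrt(unsigned long x)       /* stand-in for int_sqrt() */
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* Chain lengths sampled from a few hash buckets, already scaled by
	 * ONE (1 << FRACT_BITS) the way has_noalias() returns them:
	 * 1.0, 2.0, 1.0, 3.0, 1.0 entries per chain. */
	unsigned long lengths[] = { 8, 16, 8, 24, 8 };
	unsigned long samples = 5, sum = 0, sum2 = 0, elasticity = 8;
	unsigned long avg, sd, chain_length_max;

	for (unsigned long i = 0; i < samples; i++) {
		sum += lengths[i];
		sum2 += lengths[i] * lengths[i];
	}
	avg = sum / samples;
	sd = isqrt(sum2 / samples - avg * avg);

	/* max(elasticity, AVG + 4*SD), shifted back out of fixed point */
	chain_length_max = (avg + 4 * sd) >> FRACT_BITS;
	if (chain_length_max < elasticity)
		chain_length_max = elasticity;

	printf("avg(x8)=%lu sd(x8)=%lu chain_length_max=%lu\n",
	       avg, sd, chain_length_max);
	return 0;
}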
819/*
820 * Given a hash chain and an item in this hash chain,
821 * find if a previous entry has the same hash_inputs
822 * (but differs on tos, mark or oif)
823 * Returns 0 if an alias is found.
824 * Returns ONE if rth has no alias before itself.
825 */
826static int has_noalias(const struct rtable *head, const struct rtable *rth)
511{ 827{
512 if (sk) { 828 const struct rtable *aux = head;
513 const struct inet_sock *inet = inet_sk(sk);
514 829
515 oif = sk->sk_bound_dev_if; 830 while (aux != rth) {
516 mark = sk->sk_mark; 831 if (compare_hash_inputs(aux, rth))
517 tos = RT_CONN_FLAGS(sk); 832 return 0;
518 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; 833 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
519 } 834 }
520 flowi4_init_output(fl4, oif, mark, tos, 835 return ONE;
521 RT_SCOPE_UNIVERSE, prot,
522 flow_flags,
523 iph->daddr, iph->saddr, 0, 0);
524} 836}
525 837
526static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, 838static void rt_check_expire(void)
527 const struct sock *sk)
528{ 839{
529 const struct iphdr *iph = ip_hdr(skb); 840 static unsigned int rover;
530 int oif = skb->dev->ifindex; 841 unsigned int i = rover, goal;
531 u8 tos = RT_TOS(iph->tos); 842 struct rtable *rth;
532 u8 prot = iph->protocol; 843 struct rtable __rcu **rthp;
533 u32 mark = skb->mark; 844 unsigned long samples = 0;
845 unsigned long sum = 0, sum2 = 0;
846 unsigned long delta;
847 u64 mult;
848
849 delta = jiffies - expires_ljiffies;
850 expires_ljiffies = jiffies;
851 mult = ((u64)delta) << rt_hash_log;
852 if (ip_rt_gc_timeout > 1)
853 do_div(mult, ip_rt_gc_timeout);
854 goal = (unsigned int)mult;
855 if (goal > rt_hash_mask)
856 goal = rt_hash_mask + 1;
857 for (; goal > 0; goal--) {
858 unsigned long tmo = ip_rt_gc_timeout;
859 unsigned long length;
860
861 i = (i + 1) & rt_hash_mask;
862 rthp = &rt_hash_table[i].chain;
863
864 if (need_resched())
865 cond_resched();
866
867 samples++;
868
869 if (rcu_dereference_raw(*rthp) == NULL)
870 continue;
871 length = 0;
872 spin_lock_bh(rt_hash_lock_addr(i));
873 while ((rth = rcu_dereference_protected(*rthp,
874 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
875 prefetch(rth->dst.rt_next);
876 if (rt_is_expired(rth)) {
877 *rthp = rth->dst.rt_next;
878 rt_free(rth);
879 continue;
880 }
881 if (rth->dst.expires) {
882 /* Entry is expired even if it is in use */
883 if (time_before_eq(jiffies, rth->dst.expires)) {
884nofree:
885 tmo >>= 1;
886 rthp = &rth->dst.rt_next;
887 /*
888 * We only count entries on
889 * a chain with equal hash inputs once
890 * so that entries for different QOS
891 * levels, and other non-hash input
892 * attributes don't unfairly skew
893 * the length computation
894 */
895 length += has_noalias(rt_hash_table[i].chain, rth);
896 continue;
897 }
898 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
899 goto nofree;
900
901 /* Cleanup aged off entries. */
902 *rthp = rth->dst.rt_next;
903 rt_free(rth);
904 }
905 spin_unlock_bh(rt_hash_lock_addr(i));
906 sum += length;
907 sum2 += length*length;
908 }
909 if (samples) {
910 unsigned long avg = sum / samples;
911 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
912 rt_chain_length_max = max_t(unsigned long,
913 ip_rt_gc_elasticity,
914 (avg + 4*sd) >> FRACT_BITS);
915 }
916 rover = i;
917}
534 918
535 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0); 919/*
920 * rt_worker_func() is run in process context.
921 * we call rt_check_expire() to scan part of the hash table
922 */
923static void rt_worker_func(struct work_struct *work)
924{
925 rt_check_expire();
926 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
536} 927}
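rt_worker_func() simply re-queues itself, so the partial hash scan keeps running every ip_rt_gc_interval without a dedicated thread. A minimal, hypothetical module showing the same self-rearming delayed-work pattern (names and the 10-second period are illustrative, not taken from route.c):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void scan_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(scan_work, scan_func);

static void scan_func(struct work_struct *work)
{
	pr_info("periodic scan ran at jiffies=%lu\n", jiffies);
	/* Re-arm ourselves, exactly like rt_worker_func() does. */
	schedule_delayed_work(&scan_work, 10 * HZ);
}

static int __init scan_init(void)
{
	schedule_delayed_work(&scan_work, 10 * HZ);
	return 0;
}

static void __exit scan_exit(void)
{
	cancel_delayed_work_sync(&scan_work);
}

module_init(scan_init);
module_exit(scan_exit);
MODULE_LICENSE("GPL");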
537 928
538static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) 929/*
930 * Perturbation of rt_genid by a small quantity [1..256]
931 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
932 * many times (2^24) without giving recent rt_genid.
933 * Jenkins hash is strong enough that little changes of rt_genid are OK.
934 */
935static void rt_cache_invalidate(struct net *net)
539{ 936{
540 const struct inet_sock *inet = inet_sk(sk); 937 unsigned char shuffle;
541 const struct ip_options_rcu *inet_opt;
542 __be32 daddr = inet->inet_daddr;
543 938
544 rcu_read_lock(); 939 get_random_bytes(&shuffle, sizeof(shuffle));
545 inet_opt = rcu_dereference(inet->inet_opt); 940 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
546 if (inet_opt && inet_opt->opt.srr) 941 redirect_genid++;
547 daddr = inet_opt->opt.faddr; 942}
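Rather than walking the whole table, the flush path bumps the per-namespace generation number by a random 1..256 and lets rt_is_expired() treat every entry minted under an older genid as dead. A userspace sketch of that lazy-invalidation idea (structure and names invented for illustration):

#include <stdio.h>
#include <stdlib.h>

static unsigned int cache_genid;     /* like net->ipv4.rt_genid */

struct entry {
	unsigned int genid;          /* generation the entry was created under */
	int value;
};

static struct entry make_entry(int value)
{
	struct entry e = { .genid = cache_genid, .value = value };
	return e;
}

static int entry_is_expired(const struct entry *e)   /* like rt_is_expired() */
{
	return e->genid != cache_genid;
}

static void cache_invalidate(void)                   /* like rt_cache_invalidate() */
{
	/* Bump by a random 1..256 so stale entries never match again soon. */
	cache_genid += (unsigned int)(rand() % 256) + 1;
}

int main(void)
{
	struct entry e = make_entry(42);

	printf("expired before flush: %d\n", entry_is_expired(&e));  /* 0 */
	cache_invalidate();
	printf("expired after flush:  %d\n", entry_is_expired(&e));  /* 1 */
	return 0;
}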
548 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 943
549 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 944/*
550 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, 945 * delay < 0 : invalidate cache (fast : entries will be deleted later)
551 inet_sk_flowi_flags(sk), 946 * delay >= 0 : invalidate & flush cache (can be long)
552 daddr, inet->inet_saddr, 0, 0); 947 */
553 rcu_read_unlock(); 948void rt_cache_flush(struct net *net, int delay)
949{
950 rt_cache_invalidate(net);
951 if (delay >= 0)
952 rt_do_flush(net, !in_softirq());
554} 953}
555 954
556static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, 955/* Flush previous cache invalidated entries from the cache */
557 const struct sk_buff *skb) 956void rt_cache_flush_batch(struct net *net)
558{ 957{
559 if (skb) 958 rt_do_flush(net, !in_softirq());
560 build_skb_flow_key(fl4, skb, sk);
561 else
562 build_sk_flow_key(fl4, sk);
563} 959}
564 960
565static inline void rt_free(struct rtable *rt) 961static void rt_emergency_hash_rebuild(struct net *net)
566{ 962{
567 call_rcu(&rt->dst.rcu_head, dst_rcu_free); 963 if (net_ratelimit())
964 printk(KERN_WARNING "Route hash chain too long!\n");
965 rt_cache_invalidate(net);
568} 966}
569 967
570static DEFINE_SPINLOCK(fnhe_lock); 968/*
969 Short description of GC goals.
970
971 We want to build an algorithm which keeps the routing cache
972 at some equilibrium point, where the number of aged-off entries
973 stays approximately equal to the number of newly generated ones.
974
975 The current expiration strength is the variable "expire".
976 We try to adjust it dynamically, so that when the network
977 is idle, expire is large enough to keep plenty of warm entries,
978 and when load increases it shrinks to limit the cache size.
979 */
571 980
572static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) 981static int rt_garbage_collect(struct dst_ops *ops)
573{ 982{
574 struct fib_nh_exception *fnhe, *oldest; 983 static unsigned long expire = RT_GC_TIMEOUT;
575 struct rtable *orig; 984 static unsigned long last_gc;
985 static int rover;
986 static int equilibrium;
987 struct rtable *rth;
988 struct rtable __rcu **rthp;
989 unsigned long now = jiffies;
990 int goal;
991 int entries = dst_entries_get_fast(&ipv4_dst_ops);
992
993 /*
994 * Garbage collection is pretty expensive,
995 * do not run it too frequently.
996 */
997
998 RT_CACHE_STAT_INC(gc_total);
999
1000 if (now - last_gc < ip_rt_gc_min_interval &&
1001 entries < ip_rt_max_size) {
1002 RT_CACHE_STAT_INC(gc_ignored);
1003 goto out;
1004 }
1005
1006 entries = dst_entries_get_slow(&ipv4_dst_ops);
1007 /* Calculate number of entries, which we want to expire now. */
1008 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1009 if (goal <= 0) {
1010 if (equilibrium < ipv4_dst_ops.gc_thresh)
1011 equilibrium = ipv4_dst_ops.gc_thresh;
1012 goal = entries - equilibrium;
1013 if (goal > 0) {
1014 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015 goal = entries - equilibrium;
1016 }
1017 } else {
1018 /* We are in a dangerous area. Try to reduce the cache really
1019 * aggressively.
1020 */
1021 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1022 equilibrium = entries - goal;
1023 }
576 1024
577 oldest = rcu_dereference(hash->chain); 1025 if (now - last_gc >= ip_rt_gc_min_interval)
578 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; 1026 last_gc = now;
579 fnhe = rcu_dereference(fnhe->fnhe_next)) { 1027
580 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) 1028 if (goal <= 0) {
581 oldest = fnhe; 1029 equilibrium += goal;
1030 goto work_done;
582 } 1031 }
583 orig = rcu_dereference(oldest->fnhe_rth); 1032
584 if (orig) { 1033 do {
585 RCU_INIT_POINTER(oldest->fnhe_rth, NULL); 1034 int i, k;
586 rt_free(orig); 1035
1036 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1037 unsigned long tmo = expire;
1038
1039 k = (k + 1) & rt_hash_mask;
1040 rthp = &rt_hash_table[k].chain;
1041 spin_lock_bh(rt_hash_lock_addr(k));
1042 while ((rth = rcu_dereference_protected(*rthp,
1043 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1044 if (!rt_is_expired(rth) &&
1045 !rt_may_expire(rth, tmo, expire)) {
1046 tmo >>= 1;
1047 rthp = &rth->dst.rt_next;
1048 continue;
1049 }
1050 *rthp = rth->dst.rt_next;
1051 rt_free(rth);
1052 goal--;
1053 }
1054 spin_unlock_bh(rt_hash_lock_addr(k));
1055 if (goal <= 0)
1056 break;
1057 }
1058 rover = k;
1059
1060 if (goal <= 0)
1061 goto work_done;
1062
1063 /* Goal is not achieved. We stop the process if:
1064
1065 - expire has been reduced to zero (otherwise, expire is halved).
1066 - the table is not full.
1067 - we are called from interrupt context.
1068 - the jiffies check is just a fallback/debug loop breaker.
1069 We will not spin here for a long time in any case.
1070 */
1071
1072 RT_CACHE_STAT_INC(gc_goal_miss);
1073
1074 if (expire == 0)
1075 break;
1076
1077 expire >>= 1;
1078
1079 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080 goto out;
1081 } while (!in_softirq() && time_before_eq(jiffies, now));
1082
1083 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1084 goto out;
1085 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1086 goto out;
1087 if (net_ratelimit())
1088 printk(KERN_WARNING "dst cache overflow\n");
1089 RT_CACHE_STAT_INC(gc_dst_overflow);
1090 return 1;
1091
1092work_done:
1093 expire += ip_rt_gc_min_interval;
1094 if (expire > ip_rt_gc_timeout ||
1095 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1096 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1097 expire = ip_rt_gc_timeout;
1098out: return 0;
1099}
1100
1101/*
1102 * Returns number of entries in a hash chain that have different hash_inputs
1103 */
1104static int slow_chain_length(const struct rtable *head)
1105{
1106 int length = 0;
1107 const struct rtable *rth = head;
1108
1109 while (rth) {
1110 length += has_noalias(head, rth);
1111 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
587 } 1112 }
588 return oldest; 1113 return length >> FRACT_BITS;
589} 1114}
590 1115
591static inline u32 fnhe_hashfun(__be32 daddr) 1116static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
592{ 1117{
593 u32 hval; 1118 struct neigh_table *tbl = &arp_tbl;
1119 static const __be32 inaddr_any = 0;
1120 struct net_device *dev = dst->dev;
1121 const __be32 *pkey = daddr;
1122 struct neighbour *n;
594 1123
595 hval = (__force u32) daddr; 1124#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
596 hval ^= (hval >> 11) ^ (hval >> 22); 1125 if (dev->type == ARPHRD_ATM)
1126 tbl = clip_tbl_hook;
1127#endif
1128 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1129 pkey = &inaddr_any;
597 1130
598 return hval & (FNHE_HASH_SIZE - 1); 1131 n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1132 if (n)
1133 return n;
1134 return neigh_create(tbl, pkey, dev);
599} 1135}
600 1136
601static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, 1137static int rt_bind_neighbour(struct rtable *rt)
602 u32 pmtu, unsigned long expires)
603{ 1138{
604 struct fnhe_hash_bucket *hash; 1139 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
605 struct fib_nh_exception *fnhe; 1140 if (IS_ERR(n))
606 int depth; 1141 return PTR_ERR(n);
607 u32 hval = fnhe_hashfun(daddr); 1142 dst_set_neighbour(&rt->dst, n);
608 1143
609 spin_lock_bh(&fnhe_lock); 1144 return 0;
1145}
1146
1147static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1148 struct sk_buff *skb, int ifindex)
1149{
1150 struct rtable *rth, *cand;
1151 struct rtable __rcu **rthp, **candp;
1152 unsigned long now;
1153 u32 min_score;
1154 int chain_length;
1155 int attempts = !in_softirq();
610 1156
611 hash = nh->nh_exceptions; 1157restart:
612 if (!hash) { 1158 chain_length = 0;
613 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); 1159 min_score = ~(u32)0;
614 if (!hash) 1160 cand = NULL;
615 goto out_unlock; 1161 candp = NULL;
616 nh->nh_exceptions = hash; 1162 now = jiffies;
1163
1164 if (!rt_caching(dev_net(rt->dst.dev))) {
1165 /*
1166 * If we're not caching, just tell the caller we
1167 * were successful and don't touch the route. The
1168 * caller hold the sole reference to the cache entry, and
1169 * it will be released when the caller is done with it.
1170 * If we drop it here, the callers have no way to resolve routes
1171 * when we're not caching. Instead, just point *rp at rt, so
1172 * the caller gets a single use out of the route
1173 * Note that we do rt_free on this new route entry, so that
1174 * once its refcount hits zero, we are still able to reap it
1175 * (Thanks Alexey)
1176 * Note: To avoid expensive rcu stuff for this uncached dst,
1177 * we set DST_NOCACHE so that dst_release() can free dst without
1178 * waiting a grace period.
1179 */
1180
1181 rt->dst.flags |= DST_NOCACHE;
1182 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1183 int err = rt_bind_neighbour(rt);
1184 if (err) {
1185 if (net_ratelimit())
1186 printk(KERN_WARNING
1187 "Neighbour table failure & not caching routes.\n");
1188 ip_rt_put(rt);
1189 return ERR_PTR(err);
1190 }
1191 }
1192
1193 goto skip_hashing;
617 } 1194 }
618 1195
619 hash += hval; 1196 rthp = &rt_hash_table[hash].chain;
620 1197
621 depth = 0; 1198 spin_lock_bh(rt_hash_lock_addr(hash));
622 for (fnhe = rcu_dereference(hash->chain); fnhe; 1199 while ((rth = rcu_dereference_protected(*rthp,
623 fnhe = rcu_dereference(fnhe->fnhe_next)) { 1200 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
624 if (fnhe->fnhe_daddr == daddr) 1201 if (rt_is_expired(rth)) {
625 break; 1202 *rthp = rth->dst.rt_next;
626 depth++; 1203 rt_free(rth);
1204 continue;
1205 }
1206 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1207 /* Put it first */
1208 *rthp = rth->dst.rt_next;
1209 /*
1210 * Since lookup is lockfree, the deletion
1211 * must be visible to another weakly ordered CPU before
1212 * the insertion at the start of the hash chain.
1213 */
1214 rcu_assign_pointer(rth->dst.rt_next,
1215 rt_hash_table[hash].chain);
1216 /*
1217 * Since lookup is lockfree, the update writes
1218 * must be ordered for consistency on SMP.
1219 */
1220 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1221
1222 dst_use(&rth->dst, now);
1223 spin_unlock_bh(rt_hash_lock_addr(hash));
1224
1225 rt_drop(rt);
1226 if (skb)
1227 skb_dst_set(skb, &rth->dst);
1228 return rth;
1229 }
1230
1231 if (!atomic_read(&rth->dst.__refcnt)) {
1232 u32 score = rt_score(rth);
1233
1234 if (score <= min_score) {
1235 cand = rth;
1236 candp = rthp;
1237 min_score = score;
1238 }
1239 }
1240
1241 chain_length++;
1242
1243 rthp = &rth->dst.rt_next;
627 } 1244 }
628 1245
629 if (fnhe) { 1246 if (cand) {
630 if (gw) 1247 /* ip_rt_gc_elasticity used to be average length of chain
631 fnhe->fnhe_gw = gw; 1248 * length, when exceeded gc becomes really aggressive.
632 if (pmtu) { 1249 *
633 fnhe->fnhe_pmtu = pmtu; 1250 * The second limit is less certain. At the moment it allows
634 fnhe->fnhe_expires = expires; 1251 * only 2 entries per bucket. We will see.
1252 */
1253 if (chain_length > ip_rt_gc_elasticity) {
1254 *candp = cand->dst.rt_next;
1255 rt_free(cand);
635 } 1256 }
636 } else { 1257 } else {
637 if (depth > FNHE_RECLAIM_DEPTH) 1258 if (chain_length > rt_chain_length_max &&
638 fnhe = fnhe_oldest(hash); 1259 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
639 else { 1260 struct net *net = dev_net(rt->dst.dev);
640 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); 1261 int num = ++net->ipv4.current_rt_cache_rebuild_count;
641 if (!fnhe) 1262 if (!rt_caching(net)) {
642 goto out_unlock; 1263 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
643 1264 rt->dst.dev->name, num);
644 fnhe->fnhe_next = hash->chain; 1265 }
645 rcu_assign_pointer(hash->chain, fnhe); 1266 rt_emergency_hash_rebuild(net);
1267 spin_unlock_bh(rt_hash_lock_addr(hash));
1268
1269 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1270 ifindex, rt_genid(net));
1271 goto restart;
646 } 1272 }
647 fnhe->fnhe_daddr = daddr;
648 fnhe->fnhe_gw = gw;
649 fnhe->fnhe_pmtu = pmtu;
650 fnhe->fnhe_expires = expires;
651 } 1273 }
652 1274
653 fnhe->fnhe_stamp = jiffies; 1275 /* Try to bind route to arp only if it is an output
1276 route or a unicast forwarding path.
1277 */
1278 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1279 int err = rt_bind_neighbour(rt);
1280 if (err) {
1281 spin_unlock_bh(rt_hash_lock_addr(hash));
1282
1283 if (err != -ENOBUFS) {
1284 rt_drop(rt);
1285 return ERR_PTR(err);
1286 }
654 1287
655out_unlock: 1288 /* Neighbour tables are full and nothing
656 spin_unlock_bh(&fnhe_lock); 1289 can be released. Try to shrink route cache,
657 return; 1290 as it most likely holds some neighbour records.
1291 */
1292 if (attempts-- > 0) {
1293 int saved_elasticity = ip_rt_gc_elasticity;
1294 int saved_int = ip_rt_gc_min_interval;
1295 ip_rt_gc_elasticity = 1;
1296 ip_rt_gc_min_interval = 0;
1297 rt_garbage_collect(&ipv4_dst_ops);
1298 ip_rt_gc_min_interval = saved_int;
1299 ip_rt_gc_elasticity = saved_elasticity;
1300 goto restart;
1301 }
1302
1303 if (net_ratelimit())
1304 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1305 rt_drop(rt);
1306 return ERR_PTR(-ENOBUFS);
1307 }
1308 }
1309
1310 rt->dst.rt_next = rt_hash_table[hash].chain;
1311
1312 /*
1313 * Since lookup is lockfree, we must make sure
1314 * previous writes to rt are committed to memory
1315 * before making rt visible to other CPUS.
1316 */
1317 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1318
1319 spin_unlock_bh(rt_hash_lock_addr(hash));
1320
1321skip_hashing:
1322 if (skb)
1323 skb_dst_set(skb, &rt->dst);
1324 return rt;
658} 1325}
659 1326
660static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, 1327static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
661 bool kill_route) 1328
1329static u32 rt_peer_genid(void)
662{ 1330{
663 __be32 new_gw = icmp_hdr(skb)->un.gateway; 1331 return atomic_read(&__rt_peer_genid);
664 __be32 old_gw = ip_hdr(skb)->saddr; 1332}
665 struct net_device *dev = skb->dev;
666 struct in_device *in_dev;
667 struct fib_result res;
668 struct neighbour *n;
669 struct net *net;
670 1333
671 switch (icmp_hdr(skb)->code & 7) { 1334void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
672 case ICMP_REDIR_NET: 1335{
673 case ICMP_REDIR_NETTOS: 1336 struct inet_peer *peer;
674 case ICMP_REDIR_HOST:
675 case ICMP_REDIR_HOSTTOS:
676 break;
677 1337
678 default: 1338 peer = inet_getpeer_v4(daddr, create);
679 return; 1339
1340 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1341 inet_putpeer(peer);
1342 else
1343 rt->rt_peer_genid = rt_peer_genid();
1344}
1345
1346/*
1347 * Peer allocation may fail only in serious out-of-memory conditions. However
1348 * we can still generate some output.
1349 * Random ID selection looks a bit dangerous because we have no way to
1350 * select an ID that is unique within a reasonable period of time.
1351 * But a broken packet identifier may be better than no packet at all.
1352 */
1353static void ip_select_fb_ident(struct iphdr *iph)
1354{
1355 static DEFINE_SPINLOCK(ip_fb_id_lock);
1356 static u32 ip_fallback_id;
1357 u32 salt;
1358
1359 spin_lock_bh(&ip_fb_id_lock);
1360 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1361 iph->id = htons(salt & 0xFFFF);
1362 ip_fallback_id = salt;
1363 spin_unlock_bh(&ip_fb_id_lock);
1364}
1365
1366void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1367{
1368 struct rtable *rt = (struct rtable *) dst;
1369
1370 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1371 if (rt->peer == NULL)
1372 rt_bind_peer(rt, rt->rt_dst, 1);
1373
1374 /* If peer is attached to destination, it is never detached,
1375 so we do not need to grab a lock to dereference it.
1376 */
1377 if (rt->peer) {
1378 iph->id = htons(inet_getid(rt->peer, more));
1379 return;
1380 }
1381 } else if (!rt)
1382 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1383 __builtin_return_address(0));
1384
1385 ip_select_fb_ident(iph);
1386}
1387EXPORT_SYMBOL(__ip_select_ident);
1388
1389static void rt_del(unsigned hash, struct rtable *rt)
1390{
1391 struct rtable __rcu **rthp;
1392 struct rtable *aux;
1393
1394 rthp = &rt_hash_table[hash].chain;
1395 spin_lock_bh(rt_hash_lock_addr(hash));
1396 ip_rt_put(rt);
1397 while ((aux = rcu_dereference_protected(*rthp,
1398 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1399 if (aux == rt || rt_is_expired(aux)) {
1400 *rthp = aux->dst.rt_next;
1401 rt_free(aux);
1402 continue;
1403 }
1404 rthp = &aux->dst.rt_next;
680 } 1405 }
1406 spin_unlock_bh(rt_hash_lock_addr(hash));
1407}
1408
1409static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1410{
1411 struct rtable *rt = (struct rtable *) dst;
1412 __be32 orig_gw = rt->rt_gateway;
1413 struct neighbour *n, *old_n;
1414
1415 dst_confirm(&rt->dst);
681 1416
682 if (rt->rt_gateway != old_gw) 1417 rt->rt_gateway = peer->redirect_learned.a4;
1418
1419 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1420 if (IS_ERR(n)) {
1421 rt->rt_gateway = orig_gw;
683 return; 1422 return;
1423 }
1424 old_n = xchg(&rt->dst._neighbour, n);
1425 if (old_n)
1426 neigh_release(old_n);
1427 if (!(n->nud_state & NUD_VALID)) {
1428 neigh_event_send(n, NULL);
1429 } else {
1430 rt->rt_flags |= RTCF_REDIRECTED;
1431 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1432 }
1433}
1434
1435/* called in rcu_read_lock() section */
1436void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1437 __be32 saddr, struct net_device *dev)
1438{
1439 int s, i;
1440 struct in_device *in_dev = __in_dev_get_rcu(dev);
1441 __be32 skeys[2] = { saddr, 0 };
1442 int ikeys[2] = { dev->ifindex, 0 };
1443 struct inet_peer *peer;
1444 struct net *net;
684 1445
685 in_dev = __in_dev_get_rcu(dev);
686 if (!in_dev) 1446 if (!in_dev)
687 return; 1447 return;
688 1448
@@ -702,50 +1462,74 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
702 goto reject_redirect; 1462 goto reject_redirect;
703 } 1463 }
704 1464
705 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); 1465 for (s = 0; s < 2; s++) {
706 if (n) { 1466 for (i = 0; i < 2; i++) {
707 if (!(n->nud_state & NUD_VALID)) { 1467 unsigned int hash;
708 neigh_event_send(n, NULL); 1468 struct rtable __rcu **rthp;
709 } else { 1469 struct rtable *rt;
710 if (fib_lookup(net, fl4, &res) == 0) { 1470
711 struct fib_nh *nh = &FIB_RES_NH(res); 1471 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
712 1472
713 update_or_create_fnhe(nh, fl4->daddr, new_gw, 1473 rthp = &rt_hash_table[hash].chain;
714 0, 0); 1474
1475 while ((rt = rcu_dereference(*rthp)) != NULL) {
1476 rthp = &rt->dst.rt_next;
1477
1478 if (rt->rt_key_dst != daddr ||
1479 rt->rt_key_src != skeys[s] ||
1480 rt->rt_oif != ikeys[i] ||
1481 rt_is_input_route(rt) ||
1482 rt_is_expired(rt) ||
1483 !net_eq(dev_net(rt->dst.dev), net) ||
1484 rt->dst.error ||
1485 rt->dst.dev != dev ||
1486 rt->rt_gateway != old_gw)
1487 continue;
1488
1489 if (!rt->peer)
1490 rt_bind_peer(rt, rt->rt_dst, 1);
1491
1492 peer = rt->peer;
1493 if (peer) {
1494 if (peer->redirect_learned.a4 != new_gw ||
1495 peer->redirect_genid != redirect_genid) {
1496 peer->redirect_learned.a4 = new_gw;
1497 peer->redirect_genid = redirect_genid;
1498 atomic_inc(&__rt_peer_genid);
1499 }
1500 check_peer_redir(&rt->dst, peer);
1501 }
715 } 1502 }
716 if (kill_route)
717 rt->dst.obsolete = DST_OBSOLETE_KILL;
718 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
719 } 1503 }
720 neigh_release(n);
721 } 1504 }
722 return; 1505 return;
723 1506
724reject_redirect: 1507reject_redirect:
725#ifdef CONFIG_IP_ROUTE_VERBOSE 1508#ifdef CONFIG_IP_ROUTE_VERBOSE
726 if (IN_DEV_LOG_MARTIANS(in_dev)) { 1509 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
727 const struct iphdr *iph = (const struct iphdr *) skb->data; 1510 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
728 __be32 daddr = iph->daddr; 1511 " Advised path = %pI4 -> %pI4\n",
729 __be32 saddr = iph->saddr; 1512 &old_gw, dev->name, &new_gw,
730 1513 &saddr, &daddr);
731 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732 " Advised path = %pI4 -> %pI4\n",
733 &old_gw, dev->name, &new_gw,
734 &saddr, &daddr);
735 }
736#endif 1514#endif
737 ; 1515 ;
738} 1516}
739 1517
740static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 1518static bool peer_pmtu_expired(struct inet_peer *peer)
741{ 1519{
742 struct rtable *rt; 1520 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
743 struct flowi4 fl4;
744 1521
745 rt = (struct rtable *) dst; 1522 return orig &&
1523 time_after_eq(jiffies, orig) &&
1524 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1525}
1526
1527static bool peer_pmtu_cleaned(struct inet_peer *peer)
1528{
1529 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
746 1530
747 ip_rt_build_flow_key(&fl4, sk, skb); 1531 return orig &&
748 __ip_do_redirect(rt, skb, &fl4, true); 1532 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
749} 1533}
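peer_pmtu_expired() and peer_pmtu_cleaned() use cmpxchg() on pmtu_expires so that, when several CPUs race, exactly one of them observes the transition and restores the original MTU. A userspace sketch of that claim-once idiom using C11 atomics (the variable name mirrors the peer field but is otherwise illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned long pmtu_expires;   /* 0 means "no learned PMTU pending" */

/* Return true for exactly one caller: the one whose compare-and-swap
 * clears the deadline, mirroring peer_pmtu_cleaned(). */
static bool claim_pmtu_reset(void)
{
	unsigned long orig = atomic_load(&pmtu_expires);

	return orig != 0 &&
	       atomic_compare_exchange_strong(&pmtu_expires, &orig, 0UL);
}

int main(void)
{
	atomic_store(&pmtu_expires, 12345UL);

	printf("first caller wins:   %d\n", claim_pmtu_reset());  /* 1 */
	printf("second caller loses: %d\n", claim_pmtu_reset());  /* 0 */
	return 0;
}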
750 1534
751static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1535static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -757,10 +1541,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
757 if (dst->obsolete > 0) { 1541 if (dst->obsolete > 0) {
758 ip_rt_put(rt); 1542 ip_rt_put(rt);
759 ret = NULL; 1543 ret = NULL;
760 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1544 } else if (rt->rt_flags & RTCF_REDIRECTED) {
761 rt->dst.expires) { 1545 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
762 ip_rt_put(rt); 1546 rt->rt_oif,
1547 rt_genid(dev_net(dst->dev)));
1548 rt_del(hash, rt);
763 ret = NULL; 1549 ret = NULL;
1550 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1551 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
764 } 1552 }
765 } 1553 }
766 return ret; 1554 return ret;
@@ -787,7 +1575,6 @@ void ip_rt_send_redirect(struct sk_buff *skb)
787 struct rtable *rt = skb_rtable(skb); 1575 struct rtable *rt = skb_rtable(skb);
788 struct in_device *in_dev; 1576 struct in_device *in_dev;
789 struct inet_peer *peer; 1577 struct inet_peer *peer;
790 struct net *net;
791 int log_martians; 1578 int log_martians;
792 1579
793 rcu_read_lock(); 1580 rcu_read_lock();
@@ -799,11 +1586,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
799 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1586 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800 rcu_read_unlock(); 1587 rcu_read_unlock();
801 1588
802 net = dev_net(rt->dst.dev); 1589 if (!rt->peer)
803 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); 1590 rt_bind_peer(rt, rt->rt_dst, 1);
1591 peer = rt->peer;
804 if (!peer) { 1592 if (!peer) {
805 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, 1593 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
806 rt_nexthop(rt, ip_hdr(skb)->daddr));
807 return; 1594 return;
808 } 1595 }
809 1596
@@ -818,7 +1605,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
818 */ 1605 */
819 if (peer->rate_tokens >= ip_rt_redirect_number) { 1606 if (peer->rate_tokens >= ip_rt_redirect_number) {
820 peer->rate_last = jiffies; 1607 peer->rate_last = jiffies;
821 goto out_put_peer; 1608 return;
822 } 1609 }
823 1610
824 /* Check for load limit; set rate_last to the latest sent 1611 /* Check for load limit; set rate_last to the latest sent
@@ -828,47 +1615,28 @@ void ip_rt_send_redirect(struct sk_buff *skb)
828 time_after(jiffies, 1615 time_after(jiffies,
829 (peer->rate_last + 1616 (peer->rate_last +
830 (ip_rt_redirect_load << peer->rate_tokens)))) { 1617 (ip_rt_redirect_load << peer->rate_tokens)))) {
831 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); 1618 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
832
833 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
834 peer->rate_last = jiffies; 1619 peer->rate_last = jiffies;
835 ++peer->rate_tokens; 1620 ++peer->rate_tokens;
836#ifdef CONFIG_IP_ROUTE_VERBOSE 1621#ifdef CONFIG_IP_ROUTE_VERBOSE
837 if (log_martians && 1622 if (log_martians &&
838 peer->rate_tokens == ip_rt_redirect_number) 1623 peer->rate_tokens == ip_rt_redirect_number &&
839 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", 1624 net_ratelimit())
840 &ip_hdr(skb)->saddr, inet_iif(skb), 1625 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
841 &ip_hdr(skb)->daddr, &gw); 1626 &ip_hdr(skb)->saddr, rt->rt_iif,
1627 &rt->rt_dst, &rt->rt_gateway);
842#endif 1628#endif
843 } 1629 }
844out_put_peer:
845 inet_putpeer(peer);
846} 1630}
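ip_rt_send_redirect() spaces out ICMP redirects by requiring jiffies to pass rate_last + (ip_rt_redirect_load << rate_tokens), so every redirect sent doubles the wait before the next one, and after ip_rt_redirect_number redirects it goes quiet. A small sketch of that exponential backoff check (the constants and field names are illustrative stand-ins for the sysctls and peer fields):

#include <stdio.h>
#include <stdbool.h>

#define REDIRECT_LOAD   2      /* base delay in ticks, like ip_rt_redirect_load */
#define REDIRECT_NUMBER 9      /* max redirects before silence, like ip_rt_redirect_number */

struct peer_state {
	unsigned long rate_last;   /* tick of the last redirect sent */
	unsigned int rate_tokens;  /* redirects sent so far in this burst */
};

/* Decide whether a redirect may be sent at time `now`, doubling the
 * required gap with every token already spent. */
static bool may_send_redirect(struct peer_state *p, unsigned long now)
{
	if (p->rate_tokens >= REDIRECT_NUMBER) {
		p->rate_last = now;
		return false;                    /* host ignores us; go quiet */
	}
	if (p->rate_tokens &&
	    now < p->rate_last + ((unsigned long)REDIRECT_LOAD << p->rate_tokens))
		return false;                    /* still inside the backoff window */
	p->rate_last = now;
	p->rate_tokens++;
	return true;
}

int main(void)
{
	struct peer_state p = { 0, 0 };

	/* Prints redirects at ticks 0, 4, 12, 28: each gap doubles. */
	for (unsigned long now = 0; now < 40; now++)
		if (may_send_redirect(&p, now))
			printf("redirect sent at tick %lu (tokens=%u)\n", now, p.rate_tokens);
	return 0;
}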
847 1631
848static int ip_error(struct sk_buff *skb) 1632static int ip_error(struct sk_buff *skb)
849{ 1633{
850 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
851 struct rtable *rt = skb_rtable(skb); 1634 struct rtable *rt = skb_rtable(skb);
852 struct inet_peer *peer; 1635 struct inet_peer *peer;
853 unsigned long now; 1636 unsigned long now;
854 struct net *net;
855 bool send; 1637 bool send;
856 int code; 1638 int code;
857 1639
858 net = dev_net(rt->dst.dev);
859 if (!IN_DEV_FORWARD(in_dev)) {
860 switch (rt->dst.error) {
861 case EHOSTUNREACH:
862 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863 break;
864
865 case ENETUNREACH:
866 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
867 break;
868 }
869 goto out;
870 }
871
872 switch (rt->dst.error) { 1640 switch (rt->dst.error) {
873 case EINVAL: 1641 case EINVAL:
874 default: 1642 default:
@@ -878,14 +1646,17 @@ static int ip_error(struct sk_buff *skb)
878 break; 1646 break;
879 case ENETUNREACH: 1647 case ENETUNREACH:
880 code = ICMP_NET_UNREACH; 1648 code = ICMP_NET_UNREACH;
881 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); 1649 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1650 IPSTATS_MIB_INNOROUTES);
882 break; 1651 break;
883 case EACCES: 1652 case EACCES:
884 code = ICMP_PKT_FILTERED; 1653 code = ICMP_PKT_FILTERED;
885 break; 1654 break;
886 } 1655 }
887 1656
888 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); 1657 if (!rt->peer)
1658 rt_bind_peer(rt, rt->rt_dst, 1);
1659 peer = rt->peer;
889 1660
890 send = true; 1661 send = true;
891 if (peer) { 1662 if (peer) {
@@ -898,7 +1669,6 @@ static int ip_error(struct sk_buff *skb)
898 peer->rate_tokens -= ip_rt_error_cost; 1669 peer->rate_tokens -= ip_rt_error_cost;
899 else 1670 else
900 send = false; 1671 send = false;
901 inet_putpeer(peer);
902 } 1672 }
903 if (send) 1673 if (send)
904 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1674 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
@@ -907,125 +1677,165 @@ out: kfree_skb(skb);
907 return 0; 1677 return 0;
908} 1678}
909 1679
910static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) 1680/*
1681 * The last two values are not from the RFC but
1682 * are needed for AMPRnet AX.25 paths.
1683 */
1684
1685static const unsigned short mtu_plateau[] =
1686{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1687
1688static inline unsigned short guess_mtu(unsigned short old_mtu)
911{ 1689{
912 struct dst_entry *dst = &rt->dst; 1690 int i;
913 struct fib_result res;
914 1691
915 if (dst->dev->mtu < mtu) 1692 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
916 return; 1693 if (old_mtu > mtu_plateau[i])
1694 return mtu_plateau[i];
1695 return 68;
1696}
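guess_mtu() walks the plateau table (the RFC 1191 values plus the two AX.25 entries noted above) and returns the first plateau strictly below the MTU implied by the offending packet, bottoming out at the 68-byte IPv4 minimum. A tiny standalone version with a worked example (the table is copied from the diff above):

#include <stdio.h>

static const unsigned short mtu_plateau[] =
	{ 32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static unsigned short guess_mtu(unsigned short old_mtu)
{
	for (unsigned int i = 0; i < sizeof(mtu_plateau) / sizeof(mtu_plateau[0]); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

int main(void)
{
	/* A 1500-byte packet bounced without a next-hop MTU drops to the 1492
	 * plateau; a 300-byte one drops to 296; tiny packets bottom out at 68. */
	printf("%u %u %u\n", guess_mtu(1500), guess_mtu(300), guess_mtu(68));
	return 0;
}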
917 1697
918 if (mtu < ip_rt_min_pmtu) 1698unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
919 mtu = ip_rt_min_pmtu; 1699 unsigned short new_mtu,
1700 struct net_device *dev)
1701{
1702 unsigned short old_mtu = ntohs(iph->tot_len);
1703 unsigned short est_mtu = 0;
1704 struct inet_peer *peer;
920 1705
921 if (!rt->rt_pmtu) { 1706 peer = inet_getpeer_v4(iph->daddr, 1);
922 dst->obsolete = DST_OBSOLETE_KILL; 1707 if (peer) {
923 } else { 1708 unsigned short mtu = new_mtu;
924 rt->rt_pmtu = mtu;
925 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
926 }
927 1709
928 rcu_read_lock(); 1710 if (new_mtu < 68 || new_mtu >= old_mtu) {
929 if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { 1711 /* BSD 4.2 derived systems incorrectly adjust
930 struct fib_nh *nh = &FIB_RES_NH(res); 1712 * tot_len by the IP header length, and report
1713 * a zero MTU in the ICMP message.
1714 */
1715 if (mtu == 0 &&
1716 old_mtu >= 68 + (iph->ihl << 2))
1717 old_mtu -= iph->ihl << 2;
1718 mtu = guess_mtu(old_mtu);
1719 }
1720
1721 if (mtu < ip_rt_min_pmtu)
1722 mtu = ip_rt_min_pmtu;
1723 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1724 unsigned long pmtu_expires;
931 1725
932 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, 1726 pmtu_expires = jiffies + ip_rt_mtu_expires;
933 jiffies + ip_rt_mtu_expires); 1727 if (!pmtu_expires)
1728 pmtu_expires = 1UL;
1729
1730 est_mtu = mtu;
1731 peer->pmtu_learned = mtu;
1732 peer->pmtu_expires = pmtu_expires;
1733 atomic_inc(&__rt_peer_genid);
1734 }
1735
1736 inet_putpeer(peer);
934 } 1737 }
935 rcu_read_unlock(); 1738 return est_mtu ? : new_mtu;
936} 1739}
937 1740
938static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 1741static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
939 struct sk_buff *skb, u32 mtu)
940{ 1742{
941 struct rtable *rt = (struct rtable *) dst; 1743 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
942 struct flowi4 fl4;
943 1744
944 ip_rt_build_flow_key(&fl4, sk, skb); 1745 if (!expires)
945 __ip_rt_update_pmtu(rt, &fl4, mtu); 1746 return;
1747 if (time_before(jiffies, expires)) {
1748 u32 orig_dst_mtu = dst_mtu(dst);
1749 if (peer->pmtu_learned < orig_dst_mtu) {
1750 if (!peer->pmtu_orig)
1751 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1752 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1753 }
1754 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1755 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
946} 1756}
947 1757
948void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, 1758static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
949 int oif, u32 mark, u8 protocol, int flow_flags)
950{ 1759{
951 const struct iphdr *iph = (const struct iphdr *) skb->data; 1760 struct rtable *rt = (struct rtable *) dst;
952 struct flowi4 fl4; 1761 struct inet_peer *peer;
953 struct rtable *rt;
954 1762
955 __build_flow_key(&fl4, NULL, iph, oif, 1763 dst_confirm(dst);
956 RT_TOS(iph->tos), protocol, mark, flow_flags);
957 rt = __ip_route_output_key(net, &fl4);
958 if (!IS_ERR(rt)) {
959 __ip_rt_update_pmtu(rt, &fl4, mtu);
960 ip_rt_put(rt);
961 }
962}
963EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
964 1764
965void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) 1765 if (!rt->peer)
966{ 1766 rt_bind_peer(rt, rt->rt_dst, 1);
967 const struct iphdr *iph = (const struct iphdr *) skb->data; 1767 peer = rt->peer;
968 struct flowi4 fl4; 1768 if (peer) {
969 struct rtable *rt; 1769 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
970 1770
971 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 1771 if (mtu < ip_rt_min_pmtu)
972 rt = __ip_route_output_key(sock_net(sk), &fl4); 1772 mtu = ip_rt_min_pmtu;
973 if (!IS_ERR(rt)) { 1773 if (!pmtu_expires || mtu < peer->pmtu_learned) {
974 __ip_rt_update_pmtu(rt, &fl4, mtu);
975 ip_rt_put(rt);
976 }
977}
978EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
979 1774
980void ipv4_redirect(struct sk_buff *skb, struct net *net, 1775 pmtu_expires = jiffies + ip_rt_mtu_expires;
981 int oif, u32 mark, u8 protocol, int flow_flags) 1776 if (!pmtu_expires)
982{ 1777 pmtu_expires = 1UL;
983 const struct iphdr *iph = (const struct iphdr *) skb->data;
984 struct flowi4 fl4;
985 struct rtable *rt;
986 1778
987 __build_flow_key(&fl4, NULL, iph, oif, 1779 peer->pmtu_learned = mtu;
988 RT_TOS(iph->tos), protocol, mark, flow_flags); 1780 peer->pmtu_expires = pmtu_expires;
989 rt = __ip_route_output_key(net, &fl4); 1781
990 if (!IS_ERR(rt)) { 1782 atomic_inc(&__rt_peer_genid);
991 __ip_do_redirect(rt, skb, &fl4, false); 1783 rt->rt_peer_genid = rt_peer_genid();
992 ip_rt_put(rt); 1784 }
1785 check_peer_pmtu(dst, peer);
993 } 1786 }
994} 1787}
995EXPORT_SYMBOL_GPL(ipv4_redirect);
996 1788
997void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) 1789
1790static void ipv4_validate_peer(struct rtable *rt)
998{ 1791{
999 const struct iphdr *iph = (const struct iphdr *) skb->data; 1792 if (rt->rt_peer_genid != rt_peer_genid()) {
1000 struct flowi4 fl4; 1793 struct inet_peer *peer;
1001 struct rtable *rt; 1794
1795 if (!rt->peer)
1796 rt_bind_peer(rt, rt->rt_dst, 0);
1002 1797
1003 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); 1798 peer = rt->peer;
1004 rt = __ip_route_output_key(sock_net(sk), &fl4); 1799 if (peer) {
1005 if (!IS_ERR(rt)) { 1800 check_peer_pmtu(&rt->dst, peer);
1006 __ip_do_redirect(rt, skb, &fl4, false); 1801
1007 ip_rt_put(rt); 1802 if (peer->redirect_genid != redirect_genid)
1803 peer->redirect_learned.a4 = 0;
1804 if (peer->redirect_learned.a4 &&
1805 peer->redirect_learned.a4 != rt->rt_gateway)
1806 check_peer_redir(&rt->dst, peer);
1807 }
1808
1809 rt->rt_peer_genid = rt_peer_genid();
1008 } 1810 }
1009} 1811}
1010EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1011 1812
1012static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1813static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1013{ 1814{
1014 struct rtable *rt = (struct rtable *) dst; 1815 struct rtable *rt = (struct rtable *) dst;
1015 1816
1016 /* All IPV4 dsts are created with ->obsolete set to the value 1817 if (rt_is_expired(rt))
1017 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1018 * into this function always.
1019 *
1020 * When a PMTU/redirect information update invalidates a
1021 * route, this is indicated by setting obsolete to
1022 * DST_OBSOLETE_KILL.
1023 */
1024 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1025 return NULL; 1818 return NULL;
1819 ipv4_validate_peer(rt);
1026 return dst; 1820 return dst;
1027} 1821}
1028 1822
1823static void ipv4_dst_destroy(struct dst_entry *dst)
1824{
1825 struct rtable *rt = (struct rtable *) dst;
1826 struct inet_peer *peer = rt->peer;
1827
1828 if (rt->fi) {
1829 fib_info_put(rt->fi);
1830 rt->fi = NULL;
1831 }
1832 if (peer) {
1833 rt->peer = NULL;
1834 inet_putpeer(peer);
1835 }
1836}
1837
1838
1029static void ipv4_link_failure(struct sk_buff *skb) 1839static void ipv4_link_failure(struct sk_buff *skb)
1030{ 1840{
1031 struct rtable *rt; 1841 struct rtable *rt;
@@ -1033,15 +1843,15 @@ static void ipv4_link_failure(struct sk_buff *skb)
1033 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1843 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034 1844
1035 rt = skb_rtable(skb); 1845 rt = skb_rtable(skb);
1036 if (rt) 1846 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1037 dst_set_expires(&rt->dst, 0); 1847 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1038} 1848}
1039 1849
1040static int ip_rt_bug(struct sk_buff *skb) 1850static int ip_rt_bug(struct sk_buff *skb)
1041{ 1851{
1042 pr_debug("%s: %pI4 -> %pI4, %s\n", 1852 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1043 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1853 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044 skb->dev ? skb->dev->name : "?"); 1854 skb->dev ? skb->dev->name : "?");
1045 kfree_skb(skb); 1855 kfree_skb(skb);
1046 WARN_ON(1); 1856 WARN_ON(1);
1047 return 0; 1857 return 0;
@@ -1081,9 +1891,8 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1081 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) 1891 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1082 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); 1892 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1083 else 1893 else
1084 src = inet_select_addr(rt->dst.dev, 1894 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1085 rt_nexthop(rt, iph->daddr), 1895 RT_SCOPE_UNIVERSE);
1086 RT_SCOPE_UNIVERSE);
1087 rcu_read_unlock(); 1896 rcu_read_unlock();
1088 } 1897 }
1089 memcpy(addr, &src, 4); 1898 memcpy(addr, &src, 4);
@@ -1112,21 +1921,14 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1112 return advmss; 1921 return advmss;
1113} 1922}
1114 1923
1115static unsigned int ipv4_mtu(const struct dst_entry *dst) 1924static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1116{ 1925{
1117 const struct rtable *rt = (const struct rtable *) dst; 1926 unsigned int mtu = dst->dev->mtu;
1118 unsigned int mtu = rt->rt_pmtu;
1119
1120 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1121 mtu = dst_metric_raw(dst, RTAX_MTU);
1122
1123 if (mtu && rt_is_output_route(rt))
1124 return mtu;
1125
1126 mtu = dst->dev->mtu;
1127 1927
1128 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { 1928 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1129 if (rt->rt_uses_gateway && mtu > 576) 1929 const struct rtable *rt = (const struct rtable *) dst;
1930
1931 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1130 mtu = 576; 1932 mtu = 576;
1131 } 1933 }
1132 1934
@@ -1136,184 +1938,77 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
1136 return mtu; 1938 return mtu;
1137} 1939}
1138 1940
1139static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) 1941static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1942 struct fib_info *fi)
1140{ 1943{
1141 struct fnhe_hash_bucket *hash = nh->nh_exceptions; 1944 struct inet_peer *peer;
1142 struct fib_nh_exception *fnhe; 1945 int create = 0;
1143 u32 hval;
1144
1145 if (!hash)
1146 return NULL;
1147
1148 hval = fnhe_hashfun(daddr);
1149
1150 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1151 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1152 if (fnhe->fnhe_daddr == daddr)
1153 return fnhe;
1154 }
1155 return NULL;
1156}
1157
1158static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1159 __be32 daddr)
1160{
1161 bool ret = false;
1162
1163 spin_lock_bh(&fnhe_lock);
1164 1946
1165 if (daddr == fnhe->fnhe_daddr) { 1947 /* If a peer entry exists for this destination, we must hook
1166 struct rtable *orig = rcu_dereference(fnhe->fnhe_rth); 1948 * it up in order to get at cached metrics.
1167 if (orig && rt_is_expired(orig)) { 1949 */
1168 fnhe->fnhe_gw = 0; 1950 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1169 fnhe->fnhe_pmtu = 0; 1951 create = 1;
1170 fnhe->fnhe_expires = 0;
1171 }
1172 if (fnhe->fnhe_pmtu) {
1173 unsigned long expires = fnhe->fnhe_expires;
1174 unsigned long diff = expires - jiffies;
1175 1952
1176 if (time_before(jiffies, expires)) { 1953 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1177 rt->rt_pmtu = fnhe->fnhe_pmtu; 1954 if (peer) {
1178 dst_set_expires(&rt->dst, diff); 1955 rt->rt_peer_genid = rt_peer_genid();
1179 } 1956 if (inet_metrics_new(peer))
1180 } 1957 memcpy(peer->metrics, fi->fib_metrics,
1181 if (fnhe->fnhe_gw) { 1958 sizeof(u32) * RTAX_MAX);
1959 dst_init_metrics(&rt->dst, peer->metrics, false);
1960
1961 check_peer_pmtu(&rt->dst, peer);
1962 if (peer->redirect_genid != redirect_genid)
1963 peer->redirect_learned.a4 = 0;
1964 if (peer->redirect_learned.a4 &&
1965 peer->redirect_learned.a4 != rt->rt_gateway) {
1966 rt->rt_gateway = peer->redirect_learned.a4;
1182 rt->rt_flags |= RTCF_REDIRECTED; 1967 rt->rt_flags |= RTCF_REDIRECTED;
1183 rt->rt_gateway = fnhe->fnhe_gw; 1968 }
1184 rt->rt_uses_gateway = 1;
1185 } else if (!rt->rt_gateway)
1186 rt->rt_gateway = daddr;
1187
1188 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1189 if (orig)
1190 rt_free(orig);
1191
1192 fnhe->fnhe_stamp = jiffies;
1193 ret = true;
1194 }
1195 spin_unlock_bh(&fnhe_lock);
1196
1197 return ret;
1198}
1199
1200static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1201{
1202 struct rtable *orig, *prev, **p;
1203 bool ret = true;
1204
1205 if (rt_is_input_route(rt)) {
1206 p = (struct rtable **)&nh->nh_rth_input;
1207 } else { 1969 } else {
1208 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); 1970 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1209 } 1971 rt->fi = fi;
1210 orig = *p; 1972 atomic_inc(&fi->fib_clntref);
1211
1212 prev = cmpxchg(p, orig, rt);
1213 if (prev == orig) {
1214 if (orig)
1215 rt_free(orig);
1216 } else
1217 ret = false;
1218
1219 return ret;
1220}
1221
1222static DEFINE_SPINLOCK(rt_uncached_lock);
1223static LIST_HEAD(rt_uncached_list);
1224
1225static void rt_add_uncached_list(struct rtable *rt)
1226{
1227 spin_lock_bh(&rt_uncached_lock);
1228 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1229 spin_unlock_bh(&rt_uncached_lock);
1230}
1231
1232static void ipv4_dst_destroy(struct dst_entry *dst)
1233{
1234 struct rtable *rt = (struct rtable *) dst;
1235
1236 if (!list_empty(&rt->rt_uncached)) {
1237 spin_lock_bh(&rt_uncached_lock);
1238 list_del(&rt->rt_uncached);
1239 spin_unlock_bh(&rt_uncached_lock);
1240 }
1241}
1242
1243void rt_flush_dev(struct net_device *dev)
1244{
1245 if (!list_empty(&rt_uncached_list)) {
1246 struct net *net = dev_net(dev);
1247 struct rtable *rt;
1248
1249 spin_lock_bh(&rt_uncached_lock);
1250 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1251 if (rt->dst.dev != dev)
1252 continue;
1253 rt->dst.dev = net->loopback_dev;
1254 dev_hold(rt->dst.dev);
1255 dev_put(dev);
1256 } 1973 }
1257 spin_unlock_bh(&rt_uncached_lock); 1974 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1258 } 1975 }
1259} 1976}
1260 1977
1261static bool rt_cache_valid(const struct rtable *rt) 1978static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1262{
1263 return rt &&
1264 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1265 !rt_is_expired(rt);
1266}
1267
1268static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1269 const struct fib_result *res, 1979 const struct fib_result *res,
1270 struct fib_nh_exception *fnhe,
1271 struct fib_info *fi, u16 type, u32 itag) 1980 struct fib_info *fi, u16 type, u32 itag)
1272{ 1981{
1273 bool cached = false; 1982 struct dst_entry *dst = &rt->dst;
1274 1983
1275 if (fi) { 1984 if (fi) {
1276 struct fib_nh *nh = &FIB_RES_NH(*res); 1985 if (FIB_RES_GW(*res) &&
1277 1986 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1278 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { 1987 rt->rt_gateway = FIB_RES_GW(*res);
1279 rt->rt_gateway = nh->nh_gw; 1988 rt_init_metrics(rt, fl4, fi);
1280 rt->rt_uses_gateway = 1;
1281 }
1282 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1283#ifdef CONFIG_IP_ROUTE_CLASSID 1989#ifdef CONFIG_IP_ROUTE_CLASSID
1284 rt->dst.tclassid = nh->nh_tclassid; 1990 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1285#endif 1991#endif
1286 if (unlikely(fnhe)) 1992 }
1287 cached = rt_bind_exception(rt, fnhe, daddr); 1993
1288 else if (!(rt->dst.flags & DST_NOCACHE)) 1994 if (dst_mtu(dst) > IP_MAX_MTU)
1289 cached = rt_cache_route(nh, rt); 1995 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1290 if (unlikely(!cached)) { 1996 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1291 /* Routes we intend to cache in nexthop exception or 1997 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1292 * FIB nexthop have the DST_NOCACHE bit clear.
1293 * However, if we are unsuccessful at storing this
1294 * route into the cache we really need to set it.
1295 */
1296 rt->dst.flags |= DST_NOCACHE;
1297 if (!rt->rt_gateway)
1298 rt->rt_gateway = daddr;
1299 rt_add_uncached_list(rt);
1300 }
1301 } else
1302 rt_add_uncached_list(rt);
1303 1998
1304#ifdef CONFIG_IP_ROUTE_CLASSID 1999#ifdef CONFIG_IP_ROUTE_CLASSID
1305#ifdef CONFIG_IP_MULTIPLE_TABLES 2000#ifdef CONFIG_IP_MULTIPLE_TABLES
1306 set_class_tag(rt, res->tclassid); 2001 set_class_tag(rt, fib_rules_tclass(res));
1307#endif 2002#endif
1308 set_class_tag(rt, itag); 2003 set_class_tag(rt, itag);
1309#endif 2004#endif
1310} 2005}
1311 2006
1312static struct rtable *rt_dst_alloc(struct net_device *dev, 2007static struct rtable *rt_dst_alloc(struct net_device *dev,
1313 bool nopolicy, bool noxfrm, bool will_cache) 2008 bool nopolicy, bool noxfrm)
1314{ 2009{
1315 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, 2010 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1316 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | 2011 DST_HOST |
1317 (nopolicy ? DST_NOPOLICY : 0) | 2012 (nopolicy ? DST_NOPOLICY : 0) |
1318 (noxfrm ? DST_NOXFRM : 0)); 2013 (noxfrm ? DST_NOXFRM : 0));
1319} 2014}
@@ -1322,7 +2017,9 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
1322static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2017static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1323 u8 tos, struct net_device *dev, int our) 2018 u8 tos, struct net_device *dev, int our)
1324{ 2019{
2020 unsigned int hash;
1325 struct rtable *rth; 2021 struct rtable *rth;
2022 __be32 spec_dst;
1326 struct in_device *in_dev = __in_dev_get_rcu(dev); 2023 struct in_device *in_dev = __in_dev_get_rcu(dev);
1327 u32 itag = 0; 2024 u32 itag = 0;
1328 int err; 2025 int err;
@@ -1333,24 +2030,21 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1333 return -EINVAL; 2030 return -EINVAL;
1334 2031
1335 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 2032 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1336 skb->protocol != htons(ETH_P_IP)) 2033 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1337 goto e_inval; 2034 goto e_inval;
1338 2035
1339 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1340 if (ipv4_is_loopback(saddr))
1341 goto e_inval;
1342
1343 if (ipv4_is_zeronet(saddr)) { 2036 if (ipv4_is_zeronet(saddr)) {
1344 if (!ipv4_is_local_multicast(daddr)) 2037 if (!ipv4_is_local_multicast(daddr))
1345 goto e_inval; 2038 goto e_inval;
2039 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1346 } else { 2040 } else {
1347 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, 2041 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1348 in_dev, &itag); 2042 &itag);
1349 if (err < 0) 2043 if (err < 0)
1350 goto e_err; 2044 goto e_err;
1351 } 2045 }
1352 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, 2046 rth = rt_dst_alloc(init_net.loopback_dev,
1353 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); 2047 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1354 if (!rth) 2048 if (!rth)
1355 goto e_nobufs; 2049 goto e_nobufs;
1356 2050
@@ -1359,15 +2053,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1359#endif 2053#endif
1360 rth->dst.output = ip_rt_bug; 2054 rth->dst.output = ip_rt_bug;
1361 2055
2056 rth->rt_key_dst = daddr;
2057 rth->rt_key_src = saddr;
1362 rth->rt_genid = rt_genid(dev_net(dev)); 2058 rth->rt_genid = rt_genid(dev_net(dev));
1363 rth->rt_flags = RTCF_MULTICAST; 2059 rth->rt_flags = RTCF_MULTICAST;
1364 rth->rt_type = RTN_MULTICAST; 2060 rth->rt_type = RTN_MULTICAST;
1365 rth->rt_is_input= 1; 2061 rth->rt_key_tos = tos;
1366 rth->rt_iif = 0; 2062 rth->rt_dst = daddr;
1367 rth->rt_pmtu = 0; 2063 rth->rt_src = saddr;
1368 rth->rt_gateway = 0; 2064 rth->rt_route_iif = dev->ifindex;
1369 rth->rt_uses_gateway = 0; 2065 rth->rt_iif = dev->ifindex;
1370 INIT_LIST_HEAD(&rth->rt_uncached); 2066 rth->rt_oif = 0;
2067 rth->rt_mark = skb->mark;
2068 rth->rt_gateway = daddr;
2069 rth->rt_spec_dst= spec_dst;
2070 rth->rt_peer_genid = 0;
2071 rth->peer = NULL;
2072 rth->fi = NULL;
1371 if (our) { 2073 if (our) {
1372 rth->dst.input= ip_local_deliver; 2074 rth->dst.input= ip_local_deliver;
1373 rth->rt_flags |= RTCF_LOCAL; 2075 rth->rt_flags |= RTCF_LOCAL;
@@ -1379,8 +2081,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1379#endif 2081#endif
1380 RT_CACHE_STAT_INC(in_slow_mc); 2082 RT_CACHE_STAT_INC(in_slow_mc);
1381 2083
1382 skb_dst_set(skb, &rth->dst); 2084 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1383 return 0; 2085 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2086 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1384 2087
1385e_nobufs: 2088e_nobufs:
1386 return -ENOBUFS; 2089 return -ENOBUFS;
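The replacement code in the right-hand column finishes ip_route_input_mc() by hashing the flow with rt_hash() and inserting the new rtable into the global route cache through rt_intern_hash(), while the left-hand column simply attaches the dst to the skb. A minimal userspace model of that bucket-chain cache, using a stand-in multiplicative hash instead of the kernel's rt_hash() and purely illustrative names, is sketched below:

/* Userspace model of a bucket-chain route cache keyed by
 * (daddr, saddr, iif, tos).  toy_hash() is a stand-in for the kernel's
 * Jenkins-hash-based rt_hash(); all names here are illustrative. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RT_HASH_BITS 8
#define RT_HASH_SIZE (1u << RT_HASH_BITS)

struct cache_entry {
	uint32_t daddr, saddr;
	int iif;
	uint8_t tos;
	struct cache_entry *next;
};

static struct cache_entry *buckets[RT_HASH_SIZE];

static unsigned int toy_hash(uint32_t daddr, uint32_t saddr, int iif, uint8_t tos)
{
	uint32_t h = daddr * 2654435761u ^ saddr * 2246822519u ^ (uint32_t)iif ^ tos;

	return (h >> (32 - RT_HASH_BITS)) & (RT_HASH_SIZE - 1);
}

static struct cache_entry *cache_lookup(uint32_t daddr, uint32_t saddr, int iif, uint8_t tos)
{
	unsigned int h = toy_hash(daddr, saddr, iif, tos);

	for (struct cache_entry *e = buckets[h]; e; e = e->next)
		if (e->daddr == daddr && e->saddr == saddr &&
		    e->iif == iif && e->tos == tos)
			return e;
	return NULL;
}

static struct cache_entry *cache_insert(uint32_t daddr, uint32_t saddr, int iif, uint8_t tos)
{
	unsigned int h = toy_hash(daddr, saddr, iif, tos);
	struct cache_entry *e = calloc(1, sizeof(*e));

	if (!e)
		return NULL;
	e->daddr = daddr;
	e->saddr = saddr;
	e->iif = iif;
	e->tos = tos;
	e->next = buckets[h];	/* new entries go at the head of the chain */
	buckets[h] = e;
	return e;
}

int main(void)
{
	cache_insert(0x0a000001, 0x0a000002, 2, 0);
	printf("hit: %s\n", cache_lookup(0x0a000001, 0x0a000002, 2, 0) ? "yes" : "no");
	return 0;
}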
@@ -1404,13 +2107,18 @@ static void ip_handle_martian_source(struct net_device *dev,
1404 * RFC1812 recommendation, if source is martian, 2107 * RFC1812 recommendation, if source is martian,
1405 * the only hint is MAC header. 2108 * the only hint is MAC header.
1406 */ 2109 */
1407 pr_warn("martian source %pI4 from %pI4, on dev %s\n", 2110 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1408 &daddr, &saddr, dev->name); 2111 &daddr, &saddr, dev->name);
1409 if (dev->hard_header_len && skb_mac_header_was_set(skb)) { 2112 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1410 print_hex_dump(KERN_WARNING, "ll header: ", 2113 int i;
1411 DUMP_PREFIX_OFFSET, 16, 1, 2114 const unsigned char *p = skb_mac_header(skb);
1412 skb_mac_header(skb), 2115 printk(KERN_WARNING "ll header: ");
1413 dev->hard_header_len, true); 2116 for (i = 0; i < dev->hard_header_len; i++, p++) {
2117 printk("%02x", *p);
2118 if (i < (dev->hard_header_len - 1))
2119 printk(":");
2120 }
2121 printk("\n");
1414 } 2122 }
1415 } 2123 }
1416#endif 2124#endif
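ip_handle_martian_source() logs the link-layer header of the offending packet; the left column uses print_hex_dump(), the right column open-codes the same colon-separated hex loop. That loop is equivalent to roughly this small userspace helper (the sample header is made up for the demo):

#include <stdio.h>

/* Print a link-layer header as colon-separated hex, mirroring the
 * open-coded printk loop shown in the right-hand column. */
static void dump_ll_header(const unsigned char *p, int len)
{
	printf("ll header: ");
	for (int i = 0; i < len; i++)
		printf("%02x%s", p[i], i < len - 1 ? ":" : "");
	printf("\n");
}

int main(void)
{
	/* A made-up 14-byte Ethernet header: dst MAC, src MAC, ethertype 0x0800 */
	const unsigned char hdr[14] = {
		0x00, 0x11, 0x22, 0x33, 0x44, 0x55,
		0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb,
		0x08, 0x00,
	};

	dump_ll_header(hdr, sizeof(hdr));
	return 0;
}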
@@ -1420,24 +2128,28 @@ static void ip_handle_martian_source(struct net_device *dev,
1420static int __mkroute_input(struct sk_buff *skb, 2128static int __mkroute_input(struct sk_buff *skb,
1421 const struct fib_result *res, 2129 const struct fib_result *res,
1422 struct in_device *in_dev, 2130 struct in_device *in_dev,
1423 __be32 daddr, __be32 saddr, u32 tos) 2131 __be32 daddr, __be32 saddr, u32 tos,
2132 struct rtable **result)
1424{ 2133{
1425 struct rtable *rth; 2134 struct rtable *rth;
1426 int err; 2135 int err;
1427 struct in_device *out_dev; 2136 struct in_device *out_dev;
1428 unsigned int flags = 0; 2137 unsigned int flags = 0;
1429 bool do_cache; 2138 __be32 spec_dst;
1430 u32 itag; 2139 u32 itag;
1431 2140
1432 /* get a working reference to the output device */ 2141 /* get a working reference to the output device */
1433 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); 2142 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1434 if (out_dev == NULL) { 2143 if (out_dev == NULL) {
1435 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); 2144 if (net_ratelimit())
2145 printk(KERN_CRIT "Bug in ip_route_input" \
2146 "_slow(). Please, report\n");
1436 return -EINVAL; 2147 return -EINVAL;
1437 } 2148 }
1438 2149
2150
1439 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), 2151 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1440 in_dev->dev, in_dev, &itag); 2152 in_dev->dev, &spec_dst, &itag);
1441 if (err < 0) { 2153 if (err < 0) {
1442 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 2154 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1443 saddr); 2155 saddr);
@@ -1445,13 +2157,13 @@ static int __mkroute_input(struct sk_buff *skb,
1445 goto cleanup; 2157 goto cleanup;
1446 } 2158 }
1447 2159
1448 do_cache = res->fi && !itag; 2160 if (err)
1449 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && 2161 flags |= RTCF_DIRECTSRC;
2162
2163 if (out_dev == in_dev && err &&
1450 (IN_DEV_SHARED_MEDIA(out_dev) || 2164 (IN_DEV_SHARED_MEDIA(out_dev) ||
1451 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { 2165 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1452 flags |= RTCF_DOREDIRECT; 2166 flags |= RTCF_DOREDIRECT;
1453 do_cache = false;
1454 }
1455 2167
1456 if (skb->protocol != htons(ETH_P_IP)) { 2168 if (skb->protocol != htons(ETH_P_IP)) {
1457 /* Not IP (i.e. ARP). Do not create route, if it is 2169 /* Not IP (i.e. ARP). Do not create route, if it is
@@ -1468,38 +2180,38 @@ static int __mkroute_input(struct sk_buff *skb,
1468 } 2180 }
1469 } 2181 }
1470 2182
1471 if (do_cache) {
1472 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1473 if (rt_cache_valid(rth)) {
1474 skb_dst_set_noref(skb, &rth->dst);
1475 goto out;
1476 }
1477 }
1478
1479 rth = rt_dst_alloc(out_dev->dev, 2183 rth = rt_dst_alloc(out_dev->dev,
1480 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2184 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1481 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); 2185 IN_DEV_CONF_GET(out_dev, NOXFRM));
1482 if (!rth) { 2186 if (!rth) {
1483 err = -ENOBUFS; 2187 err = -ENOBUFS;
1484 goto cleanup; 2188 goto cleanup;
1485 } 2189 }
1486 2190
2191 rth->rt_key_dst = daddr;
2192 rth->rt_key_src = saddr;
1487 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 2193 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1488 rth->rt_flags = flags; 2194 rth->rt_flags = flags;
1489 rth->rt_type = res->type; 2195 rth->rt_type = res->type;
1490 rth->rt_is_input = 1; 2196 rth->rt_key_tos = tos;
1491 rth->rt_iif = 0; 2197 rth->rt_dst = daddr;
1492 rth->rt_pmtu = 0; 2198 rth->rt_src = saddr;
1493 rth->rt_gateway = 0; 2199 rth->rt_route_iif = in_dev->dev->ifindex;
1494 rth->rt_uses_gateway = 0; 2200 rth->rt_iif = in_dev->dev->ifindex;
1495 INIT_LIST_HEAD(&rth->rt_uncached); 2201 rth->rt_oif = 0;
2202 rth->rt_mark = skb->mark;
2203 rth->rt_gateway = daddr;
2204 rth->rt_spec_dst= spec_dst;
2205 rth->rt_peer_genid = 0;
2206 rth->peer = NULL;
2207 rth->fi = NULL;
1496 2208
1497 rth->dst.input = ip_forward; 2209 rth->dst.input = ip_forward;
1498 rth->dst.output = ip_output; 2210 rth->dst.output = ip_output;
1499 2211
1500 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); 2212 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1501 skb_dst_set(skb, &rth->dst); 2213
1502out: 2214 *result = rth;
1503 err = 0; 2215 err = 0;
1504 cleanup: 2216 cleanup:
1505 return err; 2217 return err;
@@ -1511,13 +2223,27 @@ static int ip_mkroute_input(struct sk_buff *skb,
1511 struct in_device *in_dev, 2223 struct in_device *in_dev,
1512 __be32 daddr, __be32 saddr, u32 tos) 2224 __be32 daddr, __be32 saddr, u32 tos)
1513{ 2225{
2226 struct rtable* rth = NULL;
2227 int err;
2228 unsigned hash;
2229
1514#ifdef CONFIG_IP_ROUTE_MULTIPATH 2230#ifdef CONFIG_IP_ROUTE_MULTIPATH
1515 if (res->fi && res->fi->fib_nhs > 1) 2231 if (res->fi && res->fi->fib_nhs > 1)
1516 fib_select_multipath(res); 2232 fib_select_multipath(res);
1517#endif 2233#endif
1518 2234
1519 /* create a routing cache entry */ 2235 /* create a routing cache entry */
1520 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); 2236 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2237 if (err)
2238 return err;
2239
2240 /* put it into the cache */
2241 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2242 rt_genid(dev_net(rth->dst.dev)));
2243 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2244 if (IS_ERR(rth))
2245 return PTR_ERR(rth);
2246 return 0;
1521} 2247}
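When CONFIG_IP_ROUTE_MULTIPATH is enabled and the matched fib_info carries more than one next hop, ip_mkroute_input() asks fib_select_multipath() to pick one before the route is built. Purely to illustrate what that selection step decides, here is a weight-proportional random pick in userspace C; it is not a claim about fib_select_multipath()'s actual policy, and all names are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Pick one of several next hops in proportion to its weight. */
struct nexthop {
	const char *gw;
	int weight;
};

static int select_nexthop(const struct nexthop *nh, int n)
{
	int total = 0, r, i;

	for (i = 0; i < n; i++)
		total += nh[i].weight;
	r = rand() % total;
	for (i = 0; i < n; i++) {
		if (r < nh[i].weight)
			return i;
		r -= nh[i].weight;
	}
	return n - 1;
}

int main(void)
{
	struct nexthop nh[] = { { "192.0.2.1", 1 }, { "192.0.2.2", 3 } };

	srand((unsigned)time(NULL));
	printf("chose %s\n", nh[select_nexthop(nh, 2)].gw);
	return 0;
}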
1522 2248
1523/* 2249/*
@@ -1537,12 +2263,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1537 struct fib_result res; 2263 struct fib_result res;
1538 struct in_device *in_dev = __in_dev_get_rcu(dev); 2264 struct in_device *in_dev = __in_dev_get_rcu(dev);
1539 struct flowi4 fl4; 2265 struct flowi4 fl4;
1540 unsigned int flags = 0; 2266 unsigned flags = 0;
1541 u32 itag = 0; 2267 u32 itag = 0;
1542 struct rtable *rth; 2268 struct rtable * rth;
2269 unsigned hash;
2270 __be32 spec_dst;
1543 int err = -EINVAL; 2271 int err = -EINVAL;
1544 struct net *net = dev_net(dev); 2272 struct net * net = dev_net(dev);
1545 bool do_cache;
1546 2273
1547 /* IP on this device is disabled. */ 2274 /* IP on this device is disabled. */
1548 2275
@@ -1553,10 +2280,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1553 by fib_lookup. 2280 by fib_lookup.
1554 */ 2281 */
1555 2282
1556 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) 2283 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2284 ipv4_is_loopback(saddr))
1557 goto martian_source; 2285 goto martian_source;
1558 2286
1559 res.fi = NULL;
1560 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) 2287 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1561 goto brd_input; 2288 goto brd_input;
1562 2289
@@ -1566,20 +2293,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1566 if (ipv4_is_zeronet(saddr)) 2293 if (ipv4_is_zeronet(saddr))
1567 goto martian_source; 2294 goto martian_source;
1568 2295
1569 if (ipv4_is_zeronet(daddr)) 2296 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
1570 goto martian_destination; 2297 goto martian_destination;
1571 2298
1572 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1573 * and call it once if daddr or/and saddr are loopback addresses
1574 */
1575 if (ipv4_is_loopback(daddr)) {
1576 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1577 goto martian_destination;
1578 } else if (ipv4_is_loopback(saddr)) {
1579 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1580 goto martian_source;
1581 }
1582
1583 /* 2299 /*
1584 * Now we are ready to route packet. 2300 * Now we are ready to route packet.
1585 */ 2301 */
@@ -1591,8 +2307,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1591 fl4.daddr = daddr; 2307 fl4.daddr = daddr;
1592 fl4.saddr = saddr; 2308 fl4.saddr = saddr;
1593 err = fib_lookup(net, &fl4, &res); 2309 err = fib_lookup(net, &fl4, &res);
1594 if (err != 0) 2310 if (err != 0) {
2311 if (!IN_DEV_FORWARD(in_dev))
2312 goto e_hostunreach;
1595 goto no_route; 2313 goto no_route;
2314 }
1596 2315
1597 RT_CACHE_STAT_INC(in_slow_tot); 2316 RT_CACHE_STAT_INC(in_slow_tot);
1598 2317
@@ -1601,15 +2320,18 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1601 2320
1602 if (res.type == RTN_LOCAL) { 2321 if (res.type == RTN_LOCAL) {
1603 err = fib_validate_source(skb, saddr, daddr, tos, 2322 err = fib_validate_source(skb, saddr, daddr, tos,
1604 LOOPBACK_IFINDEX, 2323 net->loopback_dev->ifindex,
1605 dev, in_dev, &itag); 2324 dev, &spec_dst, &itag);
1606 if (err < 0) 2325 if (err < 0)
1607 goto martian_source_keep_err; 2326 goto martian_source_keep_err;
2327 if (err)
2328 flags |= RTCF_DIRECTSRC;
2329 spec_dst = daddr;
1608 goto local_input; 2330 goto local_input;
1609 } 2331 }
1610 2332
1611 if (!IN_DEV_FORWARD(in_dev)) 2333 if (!IN_DEV_FORWARD(in_dev))
1612 goto no_route; 2334 goto e_hostunreach;
1613 if (res.type != RTN_UNICAST) 2335 if (res.type != RTN_UNICAST)
1614 goto martian_destination; 2336 goto martian_destination;
1615 2337
@@ -1620,32 +2342,23 @@ brd_input:
1620 if (skb->protocol != htons(ETH_P_IP)) 2342 if (skb->protocol != htons(ETH_P_IP))
1621 goto e_inval; 2343 goto e_inval;
1622 2344
1623 if (!ipv4_is_zeronet(saddr)) { 2345 if (ipv4_is_zeronet(saddr))
1624 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, 2346 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1625 in_dev, &itag); 2347 else {
2348 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2349 &itag);
1626 if (err < 0) 2350 if (err < 0)
1627 goto martian_source_keep_err; 2351 goto martian_source_keep_err;
2352 if (err)
2353 flags |= RTCF_DIRECTSRC;
1628 } 2354 }
1629 flags |= RTCF_BROADCAST; 2355 flags |= RTCF_BROADCAST;
1630 res.type = RTN_BROADCAST; 2356 res.type = RTN_BROADCAST;
1631 RT_CACHE_STAT_INC(in_brd); 2357 RT_CACHE_STAT_INC(in_brd);
1632 2358
1633local_input: 2359local_input:
1634 do_cache = false;
1635 if (res.fi) {
1636 if (!itag) {
1637 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1638 if (rt_cache_valid(rth)) {
1639 skb_dst_set_noref(skb, &rth->dst);
1640 err = 0;
1641 goto out;
1642 }
1643 do_cache = true;
1644 }
1645 }
1646
1647 rth = rt_dst_alloc(net->loopback_dev, 2360 rth = rt_dst_alloc(net->loopback_dev,
1648 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); 2361 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1649 if (!rth) 2362 if (!rth)
1650 goto e_nobufs; 2363 goto e_nobufs;
1651 2364
@@ -1655,28 +2368,41 @@ local_input:
1655 rth->dst.tclassid = itag; 2368 rth->dst.tclassid = itag;
1656#endif 2369#endif
1657 2370
2371 rth->rt_key_dst = daddr;
2372 rth->rt_key_src = saddr;
1658 rth->rt_genid = rt_genid(net); 2373 rth->rt_genid = rt_genid(net);
1659 rth->rt_flags = flags|RTCF_LOCAL; 2374 rth->rt_flags = flags|RTCF_LOCAL;
1660 rth->rt_type = res.type; 2375 rth->rt_type = res.type;
1661 rth->rt_is_input = 1; 2376 rth->rt_key_tos = tos;
1662 rth->rt_iif = 0; 2377 rth->rt_dst = daddr;
1663 rth->rt_pmtu = 0; 2378 rth->rt_src = saddr;
1664 rth->rt_gateway = 0; 2379#ifdef CONFIG_IP_ROUTE_CLASSID
1665 rth->rt_uses_gateway = 0; 2380 rth->dst.tclassid = itag;
1666 INIT_LIST_HEAD(&rth->rt_uncached); 2381#endif
2382 rth->rt_route_iif = dev->ifindex;
2383 rth->rt_iif = dev->ifindex;
2384 rth->rt_oif = 0;
2385 rth->rt_mark = skb->mark;
2386 rth->rt_gateway = daddr;
2387 rth->rt_spec_dst= spec_dst;
2388 rth->rt_peer_genid = 0;
2389 rth->peer = NULL;
2390 rth->fi = NULL;
1667 if (res.type == RTN_UNREACHABLE) { 2391 if (res.type == RTN_UNREACHABLE) {
1668 rth->dst.input= ip_error; 2392 rth->dst.input= ip_error;
1669 rth->dst.error= -err; 2393 rth->dst.error= -err;
1670 rth->rt_flags &= ~RTCF_LOCAL; 2394 rth->rt_flags &= ~RTCF_LOCAL;
1671 } 2395 }
1672 if (do_cache) 2396 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
1673 rt_cache_route(&FIB_RES_NH(res), rth); 2397 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
1674 skb_dst_set(skb, &rth->dst);
1675 err = 0; 2398 err = 0;
2399 if (IS_ERR(rth))
2400 err = PTR_ERR(rth);
1676 goto out; 2401 goto out;
1677 2402
1678no_route: 2403no_route:
1679 RT_CACHE_STAT_INC(in_no_route); 2404 RT_CACHE_STAT_INC(in_no_route);
2405 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1680 res.type = RTN_UNREACHABLE; 2406 res.type = RTN_UNREACHABLE;
1681 if (err == -ESRCH) 2407 if (err == -ESRCH)
1682 err = -ENETUNREACH; 2408 err = -ENETUNREACH;
@@ -1688,11 +2414,15 @@ no_route:
1688martian_destination: 2414martian_destination:
1689 RT_CACHE_STAT_INC(in_martian_dst); 2415 RT_CACHE_STAT_INC(in_martian_dst);
1690#ifdef CONFIG_IP_ROUTE_VERBOSE 2416#ifdef CONFIG_IP_ROUTE_VERBOSE
1691 if (IN_DEV_LOG_MARTIANS(in_dev)) 2417 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1692 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", 2418 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
1693 &daddr, &saddr, dev->name); 2419 &daddr, &saddr, dev->name);
1694#endif 2420#endif
1695 2421
2422e_hostunreach:
2423 err = -EHOSTUNREACH;
2424 goto out;
2425
1696e_inval: 2426e_inval:
1697 err = -EINVAL; 2427 err = -EINVAL;
1698 goto out; 2428 goto out;
@@ -1708,13 +2438,50 @@ martian_source_keep_err:
1708 goto out; 2438 goto out;
1709} 2439}
1710 2440
1711int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2441int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1712 u8 tos, struct net_device *dev) 2442 u8 tos, struct net_device *dev, bool noref)
1713{ 2443{
2444 struct rtable * rth;
2445 unsigned hash;
2446 int iif = dev->ifindex;
2447 struct net *net;
1714 int res; 2448 int res;
1715 2449
2450 net = dev_net(dev);
2451
1716 rcu_read_lock(); 2452 rcu_read_lock();
1717 2453
2454 if (!rt_caching(net))
2455 goto skip_cache;
2456
2457 tos &= IPTOS_RT_MASK;
2458 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2459
2460 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2461 rth = rcu_dereference(rth->dst.rt_next)) {
2462 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2463 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2464 (rth->rt_route_iif ^ iif) |
2465 (rth->rt_key_tos ^ tos)) == 0 &&
2466 rth->rt_mark == skb->mark &&
2467 net_eq(dev_net(rth->dst.dev), net) &&
2468 !rt_is_expired(rth)) {
2469 ipv4_validate_peer(rth);
2470 if (noref) {
2471 dst_use_noref(&rth->dst, jiffies);
2472 skb_dst_set_noref(skb, &rth->dst);
2473 } else {
2474 dst_use(&rth->dst, jiffies);
2475 skb_dst_set(skb, &rth->dst);
2476 }
2477 RT_CACHE_STAT_INC(in_hit);
2478 rcu_read_unlock();
2479 return 0;
2480 }
2481 RT_CACHE_STAT_INC(in_hlist_search);
2482 }
2483
2484skip_cache:
1718 /* Multicast recognition logic is moved from route cache to here. 2485 /* Multicast recognition logic is moved from route cache to here.
1719 The problem was that too many Ethernet cards have broken/missing 2486 The problem was that too many Ethernet cards have broken/missing
1720 hardware multicast filters :-( As result the host on multicasting 2487 hardware multicast filters :-( As result the host on multicasting
@@ -1752,29 +2519,24 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1752 rcu_read_unlock(); 2519 rcu_read_unlock();
1753 return res; 2520 return res;
1754} 2521}
1755EXPORT_SYMBOL(ip_route_input_noref); 2522EXPORT_SYMBOL(ip_route_input_common);
1756 2523
1757/* called with rcu_read_lock() */ 2524/* called with rcu_read_lock() */
1758static struct rtable *__mkroute_output(const struct fib_result *res, 2525static struct rtable *__mkroute_output(const struct fib_result *res,
1759 const struct flowi4 *fl4, int orig_oif, 2526 const struct flowi4 *fl4,
2527 __be32 orig_daddr, __be32 orig_saddr,
2528 int orig_oif, __u8 orig_rtos,
1760 struct net_device *dev_out, 2529 struct net_device *dev_out,
1761 unsigned int flags) 2530 unsigned int flags)
1762{ 2531{
1763 struct fib_info *fi = res->fi; 2532 struct fib_info *fi = res->fi;
1764 struct fib_nh_exception *fnhe;
1765 struct in_device *in_dev; 2533 struct in_device *in_dev;
1766 u16 type = res->type; 2534 u16 type = res->type;
1767 struct rtable *rth; 2535 struct rtable *rth;
1768 bool do_cache;
1769 2536
1770 in_dev = __in_dev_get_rcu(dev_out); 2537 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1771 if (!in_dev)
1772 return ERR_PTR(-EINVAL); 2538 return ERR_PTR(-EINVAL);
1773 2539
1774 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1775 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1776 return ERR_PTR(-EINVAL);
1777
1778 if (ipv4_is_lbcast(fl4->daddr)) 2540 if (ipv4_is_lbcast(fl4->daddr))
1779 type = RTN_BROADCAST; 2541 type = RTN_BROADCAST;
1780 else if (ipv4_is_multicast(fl4->daddr)) 2542 else if (ipv4_is_multicast(fl4->daddr))
@@ -1785,7 +2547,10 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
1785 if (dev_out->flags & IFF_LOOPBACK) 2547 if (dev_out->flags & IFF_LOOPBACK)
1786 flags |= RTCF_LOCAL; 2548 flags |= RTCF_LOCAL;
1787 2549
1788 do_cache = true; 2550 in_dev = __in_dev_get_rcu(dev_out);
2551 if (!in_dev)
2552 return ERR_PTR(-EINVAL);
2553
1789 if (type == RTN_BROADCAST) { 2554 if (type == RTN_BROADCAST) {
1790 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2555 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1791 fi = NULL; 2556 fi = NULL;
@@ -1794,8 +2559,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
1794 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, 2559 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1795 fl4->flowi4_proto)) 2560 fl4->flowi4_proto))
1796 flags &= ~RTCF_LOCAL; 2561 flags &= ~RTCF_LOCAL;
1797 else
1798 do_cache = false;
1799 /* If multicast route do not exist use 2562 /* If multicast route do not exist use
1800 * default one, but do not gateway in this case. 2563 * default one, but do not gateway in this case.
1801 * Yes, it is hack. 2564 * Yes, it is hack.
@@ -1804,57 +2567,40 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
1804 fi = NULL; 2567 fi = NULL;
1805 } 2568 }
1806 2569
1807 fnhe = NULL;
1808 do_cache &= fi != NULL;
1809 if (do_cache) {
1810 struct rtable __rcu **prth;
1811 struct fib_nh *nh = &FIB_RES_NH(*res);
1812
1813 fnhe = find_exception(nh, fl4->daddr);
1814 if (fnhe)
1815 prth = &fnhe->fnhe_rth;
1816 else {
1817 if (unlikely(fl4->flowi4_flags &
1818 FLOWI_FLAG_KNOWN_NH &&
1819 !(nh->nh_gw &&
1820 nh->nh_scope == RT_SCOPE_LINK))) {
1821 do_cache = false;
1822 goto add;
1823 }
1824 prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1825 }
1826 rth = rcu_dereference(*prth);
1827 if (rt_cache_valid(rth)) {
1828 dst_hold(&rth->dst);
1829 return rth;
1830 }
1831 }
1832
1833add:
1834 rth = rt_dst_alloc(dev_out, 2570 rth = rt_dst_alloc(dev_out,
1835 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2571 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1836 IN_DEV_CONF_GET(in_dev, NOXFRM), 2572 IN_DEV_CONF_GET(in_dev, NOXFRM));
1837 do_cache);
1838 if (!rth) 2573 if (!rth)
1839 return ERR_PTR(-ENOBUFS); 2574 return ERR_PTR(-ENOBUFS);
1840 2575
1841 rth->dst.output = ip_output; 2576 rth->dst.output = ip_output;
1842 2577
2578 rth->rt_key_dst = orig_daddr;
2579 rth->rt_key_src = orig_saddr;
1843 rth->rt_genid = rt_genid(dev_net(dev_out)); 2580 rth->rt_genid = rt_genid(dev_net(dev_out));
1844 rth->rt_flags = flags; 2581 rth->rt_flags = flags;
1845 rth->rt_type = type; 2582 rth->rt_type = type;
1846 rth->rt_is_input = 0; 2583 rth->rt_key_tos = orig_rtos;
1847 rth->rt_iif = orig_oif ? : 0; 2584 rth->rt_dst = fl4->daddr;
1848 rth->rt_pmtu = 0; 2585 rth->rt_src = fl4->saddr;
1849 rth->rt_gateway = 0; 2586 rth->rt_route_iif = 0;
1850 rth->rt_uses_gateway = 0; 2587 rth->rt_iif = orig_oif ? : dev_out->ifindex;
1851 INIT_LIST_HEAD(&rth->rt_uncached); 2588 rth->rt_oif = orig_oif;
2589 rth->rt_mark = fl4->flowi4_mark;
2590 rth->rt_gateway = fl4->daddr;
2591 rth->rt_spec_dst= fl4->saddr;
2592 rth->rt_peer_genid = 0;
2593 rth->peer = NULL;
2594 rth->fi = NULL;
1852 2595
1853 RT_CACHE_STAT_INC(out_slow_tot); 2596 RT_CACHE_STAT_INC(out_slow_tot);
1854 2597
1855 if (flags & RTCF_LOCAL) 2598 if (flags & RTCF_LOCAL) {
1856 rth->dst.input = ip_local_deliver; 2599 rth->dst.input = ip_local_deliver;
2600 rth->rt_spec_dst = fl4->daddr;
2601 }
1857 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2602 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2603 rth->rt_spec_dst = fl4->saddr;
1858 if (flags & RTCF_LOCAL && 2604 if (flags & RTCF_LOCAL &&
1859 !(dev_out->flags & IFF_LOOPBACK)) { 2605 !(dev_out->flags & IFF_LOOPBACK)) {
1860 rth->dst.output = ip_mc_output; 2606 rth->dst.output = ip_mc_output;
@@ -1871,31 +2617,37 @@ add:
1871#endif 2617#endif
1872 } 2618 }
1873 2619
1874 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); 2620 rt_set_nexthop(rth, fl4, res, fi, type, 0);
1875 2621
1876 return rth; 2622 return rth;
1877} 2623}
1878 2624
1879/* 2625/*
1880 * Major route resolver routine. 2626 * Major route resolver routine.
2627 * called with rcu_read_lock();
1881 */ 2628 */
1882 2629
1883struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) 2630static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
1884{ 2631{
1885 struct net_device *dev_out = NULL; 2632 struct net_device *dev_out = NULL;
1886 __u8 tos = RT_FL_TOS(fl4); 2633 __u8 tos = RT_FL_TOS(fl4);
1887 unsigned int flags = 0; 2634 unsigned int flags = 0;
1888 struct fib_result res; 2635 struct fib_result res;
1889 struct rtable *rth; 2636 struct rtable *rth;
2637 __be32 orig_daddr;
2638 __be32 orig_saddr;
1890 int orig_oif; 2639 int orig_oif;
1891 2640
1892 res.tclassid = 0;
1893 res.fi = NULL; 2641 res.fi = NULL;
1894 res.table = NULL; 2642#ifdef CONFIG_IP_MULTIPLE_TABLES
2643 res.r = NULL;
2644#endif
1895 2645
2646 orig_daddr = fl4->daddr;
2647 orig_saddr = fl4->saddr;
1896 orig_oif = fl4->flowi4_oif; 2648 orig_oif = fl4->flowi4_oif;
1897 2649
1898 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2650 fl4->flowi4_iif = net->loopback_dev->ifindex;
1899 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2651 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1900 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2652 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1901 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2653 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
@@ -1984,7 +2736,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1984 if (!fl4->daddr) 2736 if (!fl4->daddr)
1985 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 2737 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1986 dev_out = net->loopback_dev; 2738 dev_out = net->loopback_dev;
1987 fl4->flowi4_oif = LOOPBACK_IFINDEX; 2739 fl4->flowi4_oif = net->loopback_dev->ifindex;
1988 res.type = RTN_LOCAL; 2740 res.type = RTN_LOCAL;
1989 flags |= RTCF_LOCAL; 2741 flags |= RTCF_LOCAL;
1990 goto make_route; 2742 goto make_route;
@@ -1992,7 +2744,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1992 2744
1993 if (fib_lookup(net, fl4, &res)) { 2745 if (fib_lookup(net, fl4, &res)) {
1994 res.fi = NULL; 2746 res.fi = NULL;
1995 res.table = NULL;
1996 if (fl4->flowi4_oif) { 2747 if (fl4->flowi4_oif) {
1997 /* Apparently, routing tables are wrong. Assume, 2748 /* Apparently, routing tables are wrong. Assume,
1998 that the destination is on link. 2749 that the destination is on link.
@@ -2031,6 +2782,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2031 } 2782 }
2032 dev_out = net->loopback_dev; 2783 dev_out = net->loopback_dev;
2033 fl4->flowi4_oif = dev_out->ifindex; 2784 fl4->flowi4_oif = dev_out->ifindex;
2785 res.fi = NULL;
2034 flags |= RTCF_LOCAL; 2786 flags |= RTCF_LOCAL;
2035 goto make_route; 2787 goto make_route;
2036 } 2788 }
@@ -2053,33 +2805,73 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
2053 2805
2054 2806
2055make_route: 2807make_route:
2056 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); 2808 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2809 tos, dev_out, flags);
2810 if (!IS_ERR(rth)) {
2811 unsigned int hash;
2812
2813 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2814 rt_genid(dev_net(dev_out)));
2815 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2816 }
2057 2817
2058out: 2818out:
2059 rcu_read_unlock(); 2819 rcu_read_unlock();
2060 return rth; 2820 return rth;
2061} 2821}
2062EXPORT_SYMBOL_GPL(__ip_route_output_key);
2063 2822
2064static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2823struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2065{ 2824{
2066 return NULL; 2825 struct rtable *rth;
2826 unsigned int hash;
2827
2828 if (!rt_caching(net))
2829 goto slow_output;
2830
2831 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2832
2833 rcu_read_lock_bh();
2834 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2835 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2836 if (rth->rt_key_dst == flp4->daddr &&
2837 rth->rt_key_src == flp4->saddr &&
2838 rt_is_output_route(rth) &&
2839 rth->rt_oif == flp4->flowi4_oif &&
2840 rth->rt_mark == flp4->flowi4_mark &&
2841 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2842 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2843 net_eq(dev_net(rth->dst.dev), net) &&
2844 !rt_is_expired(rth)) {
2845 ipv4_validate_peer(rth);
2846 dst_use(&rth->dst, jiffies);
2847 RT_CACHE_STAT_INC(out_hit);
2848 rcu_read_unlock_bh();
2849 if (!flp4->saddr)
2850 flp4->saddr = rth->rt_src;
2851 if (!flp4->daddr)
2852 flp4->daddr = rth->rt_dst;
2853 return rth;
2854 }
2855 RT_CACHE_STAT_INC(out_hlist_search);
2856 }
2857 rcu_read_unlock_bh();
2858
2859slow_output:
2860 return ip_route_output_slow(net, flp4);
2067} 2861}
2862EXPORT_SYMBOL_GPL(__ip_route_output_key);
2068 2863
2069static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) 2864static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2070{ 2865{
2071 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 2866 return NULL;
2072
2073 return mtu ? : dst->dev->mtu;
2074} 2867}
2075 2868
2076static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 2869static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2077 struct sk_buff *skb, u32 mtu)
2078{ 2870{
2871 return 0;
2079} 2872}
2080 2873
2081static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 2874static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2082 struct sk_buff *skb)
2083{ 2875{
2084} 2876}
2085 2877
@@ -2092,43 +2884,53 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2092static struct dst_ops ipv4_dst_blackhole_ops = { 2884static struct dst_ops ipv4_dst_blackhole_ops = {
2093 .family = AF_INET, 2885 .family = AF_INET,
2094 .protocol = cpu_to_be16(ETH_P_IP), 2886 .protocol = cpu_to_be16(ETH_P_IP),
2887 .destroy = ipv4_dst_destroy,
2095 .check = ipv4_blackhole_dst_check, 2888 .check = ipv4_blackhole_dst_check,
2096 .mtu = ipv4_blackhole_mtu, 2889 .default_mtu = ipv4_blackhole_default_mtu,
2097 .default_advmss = ipv4_default_advmss, 2890 .default_advmss = ipv4_default_advmss,
2098 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2891 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2099 .redirect = ipv4_rt_blackhole_redirect,
2100 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2892 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2101 .neigh_lookup = ipv4_neigh_lookup, 2893 .neigh_lookup = ipv4_neigh_lookup,
2102}; 2894};
2103 2895
2104struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2896struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2105{ 2897{
2898 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2106 struct rtable *ort = (struct rtable *) dst_orig; 2899 struct rtable *ort = (struct rtable *) dst_orig;
2107 struct rtable *rt;
2108 2900
2109 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2110 if (rt) { 2901 if (rt) {
2111 struct dst_entry *new = &rt->dst; 2902 struct dst_entry *new = &rt->dst;
2112 2903
2113 new->__use = 1; 2904 new->__use = 1;
2114 new->input = dst_discard; 2905 new->input = dst_discard;
2115 new->output = dst_discard; 2906 new->output = dst_discard;
2907 dst_copy_metrics(new, &ort->dst);
2116 2908
2117 new->dev = ort->dst.dev; 2909 new->dev = ort->dst.dev;
2118 if (new->dev) 2910 if (new->dev)
2119 dev_hold(new->dev); 2911 dev_hold(new->dev);
2120 2912
2121 rt->rt_is_input = ort->rt_is_input; 2913 rt->rt_key_dst = ort->rt_key_dst;
2914 rt->rt_key_src = ort->rt_key_src;
2915 rt->rt_key_tos = ort->rt_key_tos;
2916 rt->rt_route_iif = ort->rt_route_iif;
2122 rt->rt_iif = ort->rt_iif; 2917 rt->rt_iif = ort->rt_iif;
2123 rt->rt_pmtu = ort->rt_pmtu; 2918 rt->rt_oif = ort->rt_oif;
2919 rt->rt_mark = ort->rt_mark;
2124 2920
2125 rt->rt_genid = rt_genid(net); 2921 rt->rt_genid = rt_genid(net);
2126 rt->rt_flags = ort->rt_flags; 2922 rt->rt_flags = ort->rt_flags;
2127 rt->rt_type = ort->rt_type; 2923 rt->rt_type = ort->rt_type;
2924 rt->rt_dst = ort->rt_dst;
2925 rt->rt_src = ort->rt_src;
2128 rt->rt_gateway = ort->rt_gateway; 2926 rt->rt_gateway = ort->rt_gateway;
2129 rt->rt_uses_gateway = ort->rt_uses_gateway; 2927 rt->rt_spec_dst = ort->rt_spec_dst;
2130 2928 rt->peer = ort->peer;
2131 INIT_LIST_HEAD(&rt->rt_uncached); 2929 if (rt->peer)
2930 atomic_inc(&rt->peer->refcnt);
2931 rt->fi = ort->fi;
2932 if (rt->fi)
2933 atomic_inc(&rt->fi->fib_clntref);
2132 2934
2133 dst_free(new); 2935 dst_free(new);
2134 } 2936 }
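ipv4_blackhole_route() clones an existing route onto ipv4_dst_blackhole_ops, a dst whose operations deliberately do nothing, so a caller can keep holding a route object that quietly absorbs PMTU updates and reports no usable MTU while the real route is being invalidated. A tiny userspace sketch of that "ops table of stubs" idea, with made-up names:

#include <stdio.h>

/* Sketch of a blackhole dst: every operation is a harmless stub. */
struct blackhole_ops {
	unsigned int (*mtu)(void);
	void (*update_pmtu)(unsigned int mtu);
};

static unsigned int bh_mtu(void)             { return 0; }   /* nothing to report */
static void bh_update_pmtu(unsigned int mtu) { (void)mtu; }  /* silently ignored */

static const struct blackhole_ops blackhole = {
	.mtu         = bh_mtu,
	.update_pmtu = bh_update_pmtu,
};

int main(void)
{
	blackhole.update_pmtu(1400);            /* dropped on the floor */
	printf("mtu = %u\n", blackhole.mtu());  /* prints 0 */
	return 0;
}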
@@ -2155,18 +2957,18 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2155} 2957}
2156EXPORT_SYMBOL_GPL(ip_route_output_flow); 2958EXPORT_SYMBOL_GPL(ip_route_output_flow);
2157 2959
2158static int rt_fill_info(struct net *net, __be32 dst, __be32 src, 2960static int rt_fill_info(struct net *net,
2159 struct flowi4 *fl4, struct sk_buff *skb, u32 portid, 2961 struct sk_buff *skb, u32 pid, u32 seq, int event,
2160 u32 seq, int event, int nowait, unsigned int flags) 2962 int nowait, unsigned int flags)
2161{ 2963{
2162 struct rtable *rt = skb_rtable(skb); 2964 struct rtable *rt = skb_rtable(skb);
2163 struct rtmsg *r; 2965 struct rtmsg *r;
2164 struct nlmsghdr *nlh; 2966 struct nlmsghdr *nlh;
2165 unsigned long expires = 0; 2967 long expires = 0;
2166 u32 error; 2968 const struct inet_peer *peer = rt->peer;
2167 u32 metrics[RTAX_MAX]; 2969 u32 id = 0, ts = 0, tsage = 0, error;
2168 2970
2169 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); 2971 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2170 if (nlh == NULL) 2972 if (nlh == NULL)
2171 return -EMSGSIZE; 2973 return -EMSGSIZE;
2172 2974
@@ -2174,10 +2976,9 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2174 r->rtm_family = AF_INET; 2976 r->rtm_family = AF_INET;
2175 r->rtm_dst_len = 32; 2977 r->rtm_dst_len = 32;
2176 r->rtm_src_len = 0; 2978 r->rtm_src_len = 0;
2177 r->rtm_tos = fl4->flowi4_tos; 2979 r->rtm_tos = rt->rt_key_tos;
2178 r->rtm_table = RT_TABLE_MAIN; 2980 r->rtm_table = RT_TABLE_MAIN;
2179 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN)) 2981 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2180 goto nla_put_failure;
2181 r->rtm_type = rt->rt_type; 2982 r->rtm_type = rt->rt_type;
2182 r->rtm_scope = RT_SCOPE_UNIVERSE; 2983 r->rtm_scope = RT_SCOPE_UNIVERSE;
2183 r->rtm_protocol = RTPROT_UNSPEC; 2984 r->rtm_protocol = RTPROT_UNSPEC;
@@ -2185,58 +2986,53 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2185 if (rt->rt_flags & RTCF_NOTIFY) 2986 if (rt->rt_flags & RTCF_NOTIFY)
2186 r->rtm_flags |= RTM_F_NOTIFY; 2987 r->rtm_flags |= RTM_F_NOTIFY;
2187 2988
2188 if (nla_put_be32(skb, RTA_DST, dst)) 2989 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2189 goto nla_put_failure; 2990
2190 if (src) { 2991 if (rt->rt_key_src) {
2191 r->rtm_src_len = 32; 2992 r->rtm_src_len = 32;
2192 if (nla_put_be32(skb, RTA_SRC, src)) 2993 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2193 goto nla_put_failure;
2194 } 2994 }
2195 if (rt->dst.dev && 2995 if (rt->dst.dev)
2196 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 2996 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2197 goto nla_put_failure;
2198#ifdef CONFIG_IP_ROUTE_CLASSID 2997#ifdef CONFIG_IP_ROUTE_CLASSID
2199 if (rt->dst.tclassid && 2998 if (rt->dst.tclassid)
2200 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2999 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2201 goto nla_put_failure;
2202#endif 3000#endif
2203 if (!rt_is_input_route(rt) && 3001 if (rt_is_input_route(rt))
2204 fl4->saddr != src) { 3002 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2205 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) 3003 else if (rt->rt_src != rt->rt_key_src)
2206 goto nla_put_failure; 3004 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2207 }
2208 if (rt->rt_uses_gateway &&
2209 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2210 goto nla_put_failure;
2211
2212 expires = rt->dst.expires;
2213 if (expires) {
2214 unsigned long now = jiffies;
2215 3005
2216 if (time_before(now, expires)) 3006 if (rt->rt_dst != rt->rt_gateway)
2217 expires -= now; 3007 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2218 else
2219 expires = 0;
2220 }
2221 3008
2222 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 3009 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2223 if (rt->rt_pmtu && expires)
2224 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2225 if (rtnetlink_put_metrics(skb, metrics) < 0)
2226 goto nla_put_failure; 3010 goto nla_put_failure;
2227 3011
2228 if (fl4->flowi4_mark && 3012 if (rt->rt_mark)
2229 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) 3013 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2230 goto nla_put_failure;
2231 3014
2232 error = rt->dst.error; 3015 error = rt->dst.error;
3016 if (peer) {
3017 inet_peer_refcheck(rt->peer);
3018 id = atomic_read(&peer->ip_id_count) & 0xffff;
3019 if (peer->tcp_ts_stamp) {
3020 ts = peer->tcp_ts;
3021 tsage = get_seconds() - peer->tcp_ts_stamp;
3022 }
3023 expires = ACCESS_ONCE(peer->pmtu_expires);
3024 if (expires)
3025 expires -= jiffies;
3026 }
2233 3027
2234 if (rt_is_input_route(rt)) { 3028 if (rt_is_input_route(rt)) {
2235#ifdef CONFIG_IP_MROUTE 3029#ifdef CONFIG_IP_MROUTE
3030 __be32 dst = rt->rt_dst;
3031
2236 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 3032 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2237 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 3033 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2238 int err = ipmr_get_route(net, skb, 3034 int err = ipmr_get_route(net, skb,
2239 fl4->saddr, fl4->daddr, 3035 rt->rt_src, rt->rt_dst,
2240 r, nowait); 3036 r, nowait);
2241 if (err <= 0) { 3037 if (err <= 0) {
2242 if (!nowait) { 3038 if (!nowait) {
@@ -2251,11 +3047,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2251 } 3047 }
2252 } else 3048 } else
2253#endif 3049#endif
2254 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) 3050 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2255 goto nla_put_failure;
2256 } 3051 }
2257 3052
2258 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 3053 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3054 expires, error) < 0)
2259 goto nla_put_failure; 3055 goto nla_put_failure;
2260 3056
2261 return nlmsg_end(skb, nlh); 3057 return nlmsg_end(skb, nlh);
@@ -2265,13 +3061,12 @@ nla_put_failure:
2265 return -EMSGSIZE; 3061 return -EMSGSIZE;
2266} 3062}
2267 3063
2268static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) 3064static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2269{ 3065{
2270 struct net *net = sock_net(in_skb->sk); 3066 struct net *net = sock_net(in_skb->sk);
2271 struct rtmsg *rtm; 3067 struct rtmsg *rtm;
2272 struct nlattr *tb[RTA_MAX+1]; 3068 struct nlattr *tb[RTA_MAX+1];
2273 struct rtable *rt = NULL; 3069 struct rtable *rt = NULL;
2274 struct flowi4 fl4;
2275 __be32 dst = 0; 3070 __be32 dst = 0;
2276 __be32 src = 0; 3071 __be32 src = 0;
2277 u32 iif; 3072 u32 iif;
@@ -2306,13 +3101,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
2306 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 3101 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2307 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 3102 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2308 3103
2309 memset(&fl4, 0, sizeof(fl4));
2310 fl4.daddr = dst;
2311 fl4.saddr = src;
2312 fl4.flowi4_tos = rtm->rtm_tos;
2313 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2314 fl4.flowi4_mark = mark;
2315
2316 if (iif) { 3104 if (iif) {
2317 struct net_device *dev; 3105 struct net_device *dev;
2318 3106
@@ -2333,6 +3121,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
2333 if (err == 0 && rt->dst.error) 3121 if (err == 0 && rt->dst.error)
2334 err = -rt->dst.error; 3122 err = -rt->dst.error;
2335 } else { 3123 } else {
3124 struct flowi4 fl4 = {
3125 .daddr = dst,
3126 .saddr = src,
3127 .flowi4_tos = rtm->rtm_tos,
3128 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3129 .flowi4_mark = mark,
3130 };
2336 rt = ip_route_output_key(net, &fl4); 3131 rt = ip_route_output_key(net, &fl4);
2337 3132
2338 err = 0; 3133 err = 0;
@@ -2347,13 +3142,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
2347 if (rtm->rtm_flags & RTM_F_NOTIFY) 3142 if (rtm->rtm_flags & RTM_F_NOTIFY)
2348 rt->rt_flags |= RTCF_NOTIFY; 3143 rt->rt_flags |= RTCF_NOTIFY;
2349 3144
2350 err = rt_fill_info(net, dst, src, &fl4, skb, 3145 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2351 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2352 RTM_NEWROUTE, 0, 0); 3146 RTM_NEWROUTE, 0, 0);
2353 if (err <= 0) 3147 if (err <= 0)
2354 goto errout_free; 3148 goto errout_free;
2355 3149
2356 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 3150 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2357errout: 3151errout:
2358 return err; 3152 return err;
2359 3153
@@ -2364,12 +3158,49 @@ errout_free:
2364 3158
2365int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) 3159int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2366{ 3160{
3161 struct rtable *rt;
3162 int h, s_h;
3163 int idx, s_idx;
3164 struct net *net;
3165
3166 net = sock_net(skb->sk);
3167
3168 s_h = cb->args[0];
3169 if (s_h < 0)
3170 s_h = 0;
3171 s_idx = idx = cb->args[1];
3172 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3173 if (!rt_hash_table[h].chain)
3174 continue;
3175 rcu_read_lock_bh();
3176 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3177 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3178 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3179 continue;
3180 if (rt_is_expired(rt))
3181 continue;
3182 skb_dst_set_noref(skb, &rt->dst);
3183 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3184 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3185 1, NLM_F_MULTI) <= 0) {
3186 skb_dst_drop(skb);
3187 rcu_read_unlock_bh();
3188 goto done;
3189 }
3190 skb_dst_drop(skb);
3191 }
3192 rcu_read_unlock_bh();
3193 }
3194
3195done:
3196 cb->args[0] = h;
3197 cb->args[1] = idx;
2367 return skb->len; 3198 return skb->len;
2368} 3199}
2369 3200
2370void ip_rt_multicast_event(struct in_device *in_dev) 3201void ip_rt_multicast_event(struct in_device *in_dev)
2371{ 3202{
2372 rt_cache_flush(dev_net(in_dev->dev)); 3203 rt_cache_flush(dev_net(in_dev->dev), 0);
2373} 3204}
2374 3205
2375#ifdef CONFIG_SYSCTL 3206#ifdef CONFIG_SYSCTL
@@ -2378,7 +3209,16 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2378 size_t *lenp, loff_t *ppos) 3209 size_t *lenp, loff_t *ppos)
2379{ 3210{
2380 if (write) { 3211 if (write) {
2381 rt_cache_flush((struct net *)__ctl->extra1); 3212 int flush_delay;
3213 ctl_table ctl;
3214 struct net *net;
3215
3216 memcpy(&ctl, __ctl, sizeof(ctl));
3217 ctl.data = &flush_delay;
3218 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3219
3220 net = (struct net *)__ctl->extra1;
3221 rt_cache_flush(net, flush_delay);
2382 return 0; 3222 return 0;
2383 } 3223 }
2384 3224
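In the right-hand ipv4_sysctl_rtcache_flush(), the handler copies the ctl_table, points .data at a local flush_delay and lets proc_dointvec() parse the user's write before acting on the value. A toy userspace model of that "redirect .data at a local" pattern follows; every name below is invented for the demo, and parse_int() merely stands in for proc_dointvec():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_ctl {
	const char *procname;
	void *data;
	size_t maxlen;
};

/* Stand-in for proc_dointvec(): parse the write buffer into *(int *)data. */
static int parse_int(struct toy_ctl *ctl, const char *buffer)
{
	*(int *)ctl->data = atoi(buffer);
	return 0;
}

static int flush_handler(struct toy_ctl *ctl, const char *buffer)
{
	int flush_delay;
	struct toy_ctl tmp;

	/* Copy the table and point .data at a local, so the parsed value
	 * never touches the table's real backing storage. */
	memcpy(&tmp, ctl, sizeof(tmp));
	tmp.data = &flush_delay;
	parse_int(&tmp, buffer);

	printf("would flush route cache with delay %d\n", flush_delay);
	return 0;
}

int main(void)
{
	int dummy;
	struct toy_ctl flush = { .procname = "flush", .data = &dummy, .maxlen = sizeof(int) };

	return flush_handler(&flush, "3");
}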
@@ -2431,6 +3271,13 @@ static ctl_table ipv4_route_table[] = {
2431 .proc_handler = proc_dointvec_jiffies, 3271 .proc_handler = proc_dointvec_jiffies,
2432 }, 3272 },
2433 { 3273 {
3274 .procname = "gc_interval",
3275 .data = &ip_rt_gc_interval,
3276 .maxlen = sizeof(int),
3277 .mode = 0644,
3278 .proc_handler = proc_dointvec_jiffies,
3279 },
3280 {
2434 .procname = "redirect_load", 3281 .procname = "redirect_load",
2435 .data = &ip_rt_redirect_load, 3282 .data = &ip_rt_redirect_load,
2436 .maxlen = sizeof(int), 3283 .maxlen = sizeof(int),
@@ -2496,6 +3343,23 @@ static ctl_table ipv4_route_table[] = {
2496 { } 3343 { }
2497}; 3344};
2498 3345
3346static struct ctl_table empty[1];
3347
3348static struct ctl_table ipv4_skeleton[] =
3349{
3350 { .procname = "route",
3351 .mode = 0555, .child = ipv4_route_table},
3352 { .procname = "neigh",
3353 .mode = 0555, .child = empty},
3354 { }
3355};
3356
3357static __net_initdata struct ctl_path ipv4_path[] = {
3358 { .procname = "net", },
3359 { .procname = "ipv4", },
3360 { },
3361};
3362
2499static struct ctl_table ipv4_route_flush_table[] = { 3363static struct ctl_table ipv4_route_flush_table[] = {
2500 { 3364 {
2501 .procname = "flush", 3365 .procname = "flush",
@@ -2506,6 +3370,13 @@ static struct ctl_table ipv4_route_flush_table[] = {
2506 { }, 3370 { },
2507}; 3371};
2508 3372
3373static __net_initdata struct ctl_path ipv4_route_path[] = {
3374 { .procname = "net", },
3375 { .procname = "ipv4", },
3376 { .procname = "route", },
3377 { },
3378};
3379
2509static __net_init int sysctl_route_net_init(struct net *net) 3380static __net_init int sysctl_route_net_init(struct net *net)
2510{ 3381{
2511 struct ctl_table *tbl; 3382 struct ctl_table *tbl;
@@ -2515,14 +3386,11 @@ static __net_init int sysctl_route_net_init(struct net *net)
2515 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3386 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2516 if (tbl == NULL) 3387 if (tbl == NULL)
2517 goto err_dup; 3388 goto err_dup;
2518
2519 /* Don't export sysctls to unprivileged users */
2520 if (net->user_ns != &init_user_ns)
2521 tbl[0].procname = NULL;
2522 } 3389 }
2523 tbl[0].extra1 = net; 3390 tbl[0].extra1 = net;
2524 3391
2525 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); 3392 net->ipv4.route_hdr =
3393 register_net_sysctl_table(net, ipv4_route_path, tbl);
2526 if (net->ipv4.route_hdr == NULL) 3394 if (net->ipv4.route_hdr == NULL)
2527 goto err_reg; 3395 goto err_reg;
2528 return 0; 3396 return 0;
@@ -2552,7 +3420,8 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
2552 3420
2553static __net_init int rt_genid_init(struct net *net) 3421static __net_init int rt_genid_init(struct net *net)
2554{ 3422{
2555 atomic_set(&net->rt_genid, 0); 3423 get_random_bytes(&net->ipv4.rt_genid,
3424 sizeof(net->ipv4.rt_genid));
2556 get_random_bytes(&net->ipv4.dev_addr_genid, 3425 get_random_bytes(&net->ipv4.dev_addr_genid,
2557 sizeof(net->ipv4.dev_addr_genid)); 3426 sizeof(net->ipv4.dev_addr_genid));
2558 return 0; 3427 return 0;
@@ -2562,35 +3431,21 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
2562 .init = rt_genid_init, 3431 .init = rt_genid_init,
2563}; 3432};
2564 3433
2565static int __net_init ipv4_inetpeer_init(struct net *net)
2566{
2567 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2568
2569 if (!bp)
2570 return -ENOMEM;
2571 inet_peer_base_init(bp);
2572 net->ipv4.peers = bp;
2573 return 0;
2574}
2575
2576static void __net_exit ipv4_inetpeer_exit(struct net *net)
2577{
2578 struct inet_peer_base *bp = net->ipv4.peers;
2579
2580 net->ipv4.peers = NULL;
2581 inetpeer_invalidate_tree(bp);
2582 kfree(bp);
2583}
2584
2585static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2586 .init = ipv4_inetpeer_init,
2587 .exit = ipv4_inetpeer_exit,
2588};
2589 3434
2590#ifdef CONFIG_IP_ROUTE_CLASSID 3435#ifdef CONFIG_IP_ROUTE_CLASSID
2591struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3436struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2592#endif /* CONFIG_IP_ROUTE_CLASSID */ 3437#endif /* CONFIG_IP_ROUTE_CLASSID */
2593 3438
3439static __initdata unsigned long rhash_entries;
3440static int __init set_rhash_entries(char *str)
3441{
3442 if (!str)
3443 return 0;
3444 rhash_entries = simple_strtoul(str, &str, 0);
3445 return 1;
3446}
3447__setup("rhash_entries=", set_rhash_entries);
3448
2594int __init ip_rt_init(void) 3449int __init ip_rt_init(void)
2595{ 3450{
2596 int rc = 0; 3451 int rc = 0;
@@ -2613,17 +3468,35 @@ int __init ip_rt_init(void)
2613 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3468 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2614 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3469 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2615 3470
2616 ipv4_dst_ops.gc_thresh = ~0; 3471 rt_hash_table = (struct rt_hash_bucket *)
2617 ip_rt_max_size = INT_MAX; 3472 alloc_large_system_hash("IP route cache",
3473 sizeof(struct rt_hash_bucket),
3474 rhash_entries,
3475 (totalram_pages >= 128 * 1024) ?
3476 15 : 17,
3477 0,
3478 &rt_hash_log,
3479 &rt_hash_mask,
3480 rhash_entries ? 0 : 512 * 1024);
3481 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3482 rt_hash_lock_init();
3483
3484 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3485 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2618 3486
2619 devinet_init(); 3487 devinet_init();
2620 ip_fib_init(); 3488 ip_fib_init();
2621 3489
3490 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3491 expires_ljiffies = jiffies;
3492 schedule_delayed_work(&expires_work,
3493 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3494
2622 if (ip_rt_proc_init()) 3495 if (ip_rt_proc_init())
2623 pr_err("Unable to create route proc files\n"); 3496 printk(KERN_ERR "Unable to create route proc files\n");
2624#ifdef CONFIG_XFRM 3497#ifdef CONFIG_XFRM
2625 xfrm_init(); 3498 xfrm_init();
2626 xfrm4_init(); 3499 xfrm4_init(ip_rt_max_size);
2627#endif 3500#endif
2628 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); 3501 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2629 3502
@@ -2631,7 +3504,6 @@ int __init ip_rt_init(void)
2631 register_pernet_subsys(&sysctl_route_ops); 3504 register_pernet_subsys(&sysctl_route_ops);
2632#endif 3505#endif
2633 register_pernet_subsys(&rt_genid_ops); 3506 register_pernet_subsys(&rt_genid_ops);
2634 register_pernet_subsys(&ipv4_inetpeer_ops);
2635 return rc; 3507 return rc;
2636} 3508}
2637 3509
@@ -2642,6 +3514,6 @@ int __init ip_rt_init(void)
2642 */ 3514 */
2643void __init ip_static_sysctl_init(void) 3515void __init ip_static_sysctl_init(void)
2644{ 3516{
2645 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); 3517 register_sysctl_paths(ipv4_path, ipv4_skeleton);
2646} 3518}
2647#endif 3519#endif
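The rt_fill_info() hunks above also show two styles of netlink attribute emission: the left column checks each nla_put_*() call explicitly and jumps on failure, while the right column's NLA_PUT_*() macros hide that goto to the local nla_put_failure label. A toy model of the convention, with a stand-in put_u32() instead of the real nla_put_u32() and a deliberately tiny message buffer:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_msg {
	unsigned char buf[16];
	size_t len;
};

/* Stand-in for nla_put_u32(): fails once the message is full. */
static int put_u32(struct toy_msg *m, uint32_t v)
{
	if (m->len + sizeof(v) > sizeof(m->buf))
		return -1;
	memcpy(m->buf + m->len, &v, sizeof(v));
	m->len += sizeof(v);
	return 0;
}

/* Mimics the old NLA_PUT_U32() convention: jump to a local label on error. */
#define TOY_PUT_U32(m, v)			\
	do {					\
		if (put_u32((m), (v)) < 0)	\
			goto nla_put_failure;	\
	} while (0)

static int fill_info(struct toy_msg *m)
{
	/* Explicit-check style. */
	if (put_u32(m, 1) < 0)
		goto nla_put_failure;
	/* Macro style: the jump is hidden inside TOY_PUT_U32(). */
	TOY_PUT_U32(m, 2);
	TOY_PUT_U32(m, 3);
	TOY_PUT_U32(m, 4);
	TOY_PUT_U32(m, 5);	/* fifth u32 overflows the 16-byte buffer */
	return 0;

nla_put_failure:
	return -1;
}

int main(void)
{
	struct toy_msg m = { .len = 0 };

	printf("fill_info: %d\n", fill_info(&m));	/* prints -1 */
	return 0;
}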
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b236ef04914..3bc5c8f7c71 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -15,7 +15,6 @@
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/cryptohash.h> 16#include <linux/cryptohash.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/export.h>
19#include <net/tcp.h> 18#include <net/tcp.h>
20#include <net/route.h> 19#include <net/route.h>
21 20
@@ -245,7 +244,7 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
245 if (!sysctl_tcp_timestamps) 244 if (!sysctl_tcp_timestamps)
246 return false; 245 return false;
247 246
248 tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0; 247 tcp_opt->sack_ok = (options >> 4) & 0x1;
249 *ecn_ok = (options >> 5) & 1; 248 *ecn_ok = (options >> 5) & 1;
250 if (*ecn_ok && !sysctl_tcp_ecn) 249 if (*ecn_ok && !sysctl_tcp_ecn)
251 return false; 250 return false;
@@ -266,7 +265,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
266 struct ip_options *opt) 265 struct ip_options *opt)
267{ 266{
268 struct tcp_options_received tcp_opt; 267 struct tcp_options_received tcp_opt;
269 const u8 *hash_location; 268 u8 *hash_location;
270 struct inet_request_sock *ireq; 269 struct inet_request_sock *ireq;
271 struct tcp_request_sock *treq; 270 struct tcp_request_sock *treq;
272 struct tcp_sock *tp = tcp_sk(sk); 271 struct tcp_sock *tp = tcp_sk(sk);
@@ -278,7 +277,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
278 struct rtable *rt; 277 struct rtable *rt;
279 __u8 rcv_wscale; 278 __u8 rcv_wscale;
280 bool ecn_ok = false; 279 bool ecn_ok = false;
281 struct flowi4 fl4;
282 280
283 if (!sysctl_tcp_syncookies || !th->ack || th->rst) 281 if (!sysctl_tcp_syncookies || !th->ack || th->rst)
284 goto out; 282 goto out;
@@ -293,7 +291,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
293 291
294 /* check for timestamp cookie support */ 292 /* check for timestamp cookie support */
295 memset(&tcp_opt, 0, sizeof(tcp_opt)); 293 memset(&tcp_opt, 0, sizeof(tcp_opt));
296 tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); 294 tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
297 295
298 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) 296 if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
299 goto out; 297 goto out;
@@ -319,7 +317,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
319 ireq->tstamp_ok = tcp_opt.saw_tstamp; 317 ireq->tstamp_ok = tcp_opt.saw_tstamp;
320 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; 318 req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
321 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; 319 treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
322 treq->listener = NULL;
323 320
324 /* We throwed the options of the initial SYN away, so we hope 321 /* We throwed the options of the initial SYN away, so we hope
325 * the ACK carries the same options again (see RFC1122 4.2.3.8) 322 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -340,7 +337,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
340 } 337 }
341 338
342 req->expires = 0UL; 339 req->expires = 0UL;
343 req->num_retrans = 0; 340 req->retrans = 0;
344 341
345 /* 342 /*
346 * We need to lookup the route here to get at the correct 343 * We need to lookup the route here to get at the correct
@@ -348,16 +345,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
348 * hasn't changed since we received the original syn, but I see 345 * hasn't changed since we received the original syn, but I see
349 * no easy way to do this. 346 * no easy way to do this.
350 */ 347 */
351 flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), 348 {
352 RT_SCOPE_UNIVERSE, IPPROTO_TCP, 349 struct flowi4 fl4;
353 inet_sk_flowi_flags(sk), 350
354 (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, 351 flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk),
355 ireq->loc_addr, th->source, th->dest); 352 RT_SCOPE_UNIVERSE, IPPROTO_TCP,
356 security_req_classify_flow(req, flowi4_to_flowi(&fl4)); 353 inet_sk_flowi_flags(sk),
357 rt = ip_route_output_key(sock_net(sk), &fl4); 354 (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
358 if (IS_ERR(rt)) { 355 ireq->loc_addr, th->source, th->dest);
359 reqsk_free(req); 356 security_req_classify_flow(req, flowi4_to_flowi(&fl4));
360 goto out; 357 rt = ip_route_output_key(sock_net(sk), &fl4);
358 if (IS_ERR(rt)) {
359 reqsk_free(req);
360 goto out;
361 }
361 } 362 }
362 363
363 /* Try to redo what tcp_v4_send_synack did. */ 364 /* Try to redo what tcp_v4_send_synack did. */
@@ -371,10 +372,5 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
371 ireq->rcv_wscale = rcv_wscale; 372 ireq->rcv_wscale = rcv_wscale;
372 373
373 ret = get_cookie_sock(sk, skb, req, &rt->dst); 374 ret = get_cookie_sock(sk, skb, req, &rt->dst);
374 /* ip_queue_xmit() depends on our flow being setup
375 * Normal sockets get it right from inet_csk_route_child_sock()
376 */
377 if (ret)
378 inet_sk(ret)->cork.fl.u.ip4 = fl4;
379out: return ret; 375out: return ret;
380} 376}
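cookie_check_timestamp() above recovers per-connection hints from a packed options word: as the hunk shows, bit 4 carries the SACK flag and bit 5 the ECN flag. A small illustration of that unpacking (not a reimplementation of the cookie format; the struct and sample value are made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cookie_opts {
	bool sack_ok;	/* bit 4 */
	bool ecn_ok;	/* bit 5 */
};

static struct cookie_opts decode_cookie_options(uint32_t options)
{
	struct cookie_opts o = {
		.sack_ok = (options >> 4) & 1,
		.ecn_ok  = (options >> 5) & 1,
	};
	return o;
}

int main(void)
{
	struct cookie_opts o = decode_cookie_options(0x30);	/* bits 4 and 5 set */

	printf("sack=%d ecn=%d\n", o.sack_ok, o.ecn_ok);
	return 0;
}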
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d84400b6504..69fd7201129 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -14,7 +14,6 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/nsproxy.h> 16#include <linux/nsproxy.h>
17#include <linux/swap.h>
18#include <net/snmp.h> 17#include <net/snmp.h>
19#include <net/icmp.h> 18#include <net/icmp.h>
20#include <net/ip.h> 19#include <net/ip.h>
@@ -24,10 +23,8 @@
24#include <net/cipso_ipv4.h> 23#include <net/cipso_ipv4.h>
25#include <net/inet_frag.h> 24#include <net/inet_frag.h>
26#include <net/ping.h> 25#include <net/ping.h>
27#include <net/tcp_memcontrol.h>
28 26
29static int zero; 27static int zero;
30static int two = 2;
31static int tcp_retr1_max = 255; 28static int tcp_retr1_max = 255;
32static int ip_local_port_range_min[] = { 1, 1 }; 29static int ip_local_port_range_min[] = { 1, 1 };
33static int ip_local_port_range_max[] = { 65535, 65535 }; 30static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -76,10 +73,10 @@ static int ipv4_local_port_range(ctl_table *table, int write,
76} 73}
77 74
78 75
79static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) 76void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high)
80{ 77{
81 kgid_t *data = table->data; 78 gid_t *data = table->data;
82 unsigned int seq; 79 unsigned seq;
83 do { 80 do {
84 seq = read_seqbegin(&sysctl_local_ports.lock); 81 seq = read_seqbegin(&sysctl_local_ports.lock);
85 82
@@ -89,12 +86,12 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low
89} 86}
90 87
91/* Update system visible IP port range */ 88/* Update system visible IP port range */
92static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high) 89static void set_ping_group_range(struct ctl_table *table, int range[2])
93{ 90{
94 kgid_t *data = table->data; 91 gid_t *data = table->data;
95 write_seqlock(&sysctl_local_ports.lock); 92 write_seqlock(&sysctl_local_ports.lock);
96 data[0] = low; 93 data[0] = range[0];
97 data[1] = high; 94 data[1] = range[1];
98 write_sequnlock(&sysctl_local_ports.lock); 95 write_sequnlock(&sysctl_local_ports.lock);
99} 96}
100 97
@@ -103,33 +100,21 @@ static int ipv4_ping_group_range(ctl_table *table, int write,
103 void __user *buffer, 100 void __user *buffer,
104 size_t *lenp, loff_t *ppos) 101 size_t *lenp, loff_t *ppos)
105{ 102{
106 struct user_namespace *user_ns = current_user_ns();
107 int ret; 103 int ret;
108 gid_t urange[2]; 104 gid_t range[2];
109 kgid_t low, high;
110 ctl_table tmp = { 105 ctl_table tmp = {
111 .data = &urange, 106 .data = &range,
112 .maxlen = sizeof(urange), 107 .maxlen = sizeof(range),
113 .mode = table->mode, 108 .mode = table->mode,
114 .extra1 = &ip_ping_group_range_min, 109 .extra1 = &ip_ping_group_range_min,
115 .extra2 = &ip_ping_group_range_max, 110 .extra2 = &ip_ping_group_range_max,
116 }; 111 };
117 112
118 inet_get_ping_group_range_table(table, &low, &high); 113 inet_get_ping_group_range_table(table, range, range + 1);
119 urange[0] = from_kgid_munged(user_ns, low);
120 urange[1] = from_kgid_munged(user_ns, high);
121 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 114 ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
122 115
123 if (write && ret == 0) { 116 if (write && ret == 0)
124 low = make_kgid(user_ns, urange[0]); 117 set_ping_group_range(table, range);
125 high = make_kgid(user_ns, urange[1]);
126 if (!gid_valid(low) || !gid_valid(high) ||
127 (urange[1] < urange[0]) || gid_lt(high, low)) {
128 low = make_kgid(&init_user_ns, 1);
129 high = make_kgid(&init_user_ns, 0);
130 }
131 set_ping_group_range(table, low, high);
132 }
133 118
134 return ret; 119 return ret;
135} 120}
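
The handler above never writes into the live table directly: it points a temporary ctl_table at a stack buffer, lets proc_dointvec_minmax() parse and range-check the input, and only then commits via set_ping_group_range(). A small userspace analogue of that parse, validate, then commit flow; parse_range() is a made-up helper, not a kernel API:

#include <stdio.h>
#include <errno.h>

/* parse two ids, validate against [min, max], commit only on success */
static int parse_range(const char *buf, int min, int max, int out[2])
{
        int lo, hi;

        if (sscanf(buf, "%d %d", &lo, &hi) != 2)
                return -EINVAL;
        if (lo < min || hi > max || hi < lo)
                return -EINVAL;
        out[0] = lo;
        out[1] = hi;
        return 0;
}

int main(void)
{
        int range[2] = { 1, 0 };        /* default: no group may create ping sockets */

        if (parse_range("100 200", 0, 65535, range) == 0)
                printf("ping_group_range set to %d %d\n", range[0], range[1]);
        return 0;
}
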
@@ -189,90 +174,6 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
189 return ret; 174 return ret;
190} 175}
191 176
192static int ipv4_tcp_mem(ctl_table *ctl, int write,
193 void __user *buffer, size_t *lenp,
194 loff_t *ppos)
195{
196 int ret;
197 unsigned long vec[3];
198 struct net *net = current->nsproxy->net_ns;
199#ifdef CONFIG_MEMCG_KMEM
200 struct mem_cgroup *memcg;
201#endif
202
203 ctl_table tmp = {
204 .data = &vec,
205 .maxlen = sizeof(vec),
206 .mode = ctl->mode,
207 };
208
209 if (!write) {
210 ctl->data = &net->ipv4.sysctl_tcp_mem;
211 return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
212 }
213
214 ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
215 if (ret)
216 return ret;
217
218#ifdef CONFIG_MEMCG_KMEM
219 rcu_read_lock();
220 memcg = mem_cgroup_from_task(current);
221
222 tcp_prot_mem(memcg, vec[0], 0);
223 tcp_prot_mem(memcg, vec[1], 1);
224 tcp_prot_mem(memcg, vec[2], 2);
225 rcu_read_unlock();
226#endif
227
228 net->ipv4.sysctl_tcp_mem[0] = vec[0];
229 net->ipv4.sysctl_tcp_mem[1] = vec[1];
230 net->ipv4.sysctl_tcp_mem[2] = vec[2];
231
232 return 0;
233}
234
235int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
236 size_t *lenp, loff_t *ppos)
237{
238 ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
239 struct tcp_fastopen_context *ctxt;
240 int ret;
241 u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
242
243 tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
244 if (!tbl.data)
245 return -ENOMEM;
246
247 rcu_read_lock();
248 ctxt = rcu_dereference(tcp_fastopen_ctx);
249 if (ctxt)
250 memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
251 else
252 memset(user_key, 0, sizeof(user_key));
253 rcu_read_unlock();
254
255 snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
256 user_key[0], user_key[1], user_key[2], user_key[3]);
257 ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
258
259 if (write && ret == 0) {
260 if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
261 user_key + 2, user_key + 3) != 4) {
262 ret = -EINVAL;
263 goto bad_key;
264 }
265 tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
266 }
267
268bad_key:
269 pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
270 user_key[0], user_key[1], user_key[2], user_key[3],
271 (char *)tbl.data, ret);
272 kfree(tbl.data);
273 return ret;
274}
275
276static struct ctl_table ipv4_table[] = { 177static struct ctl_table ipv4_table[] = {
277 { 178 {
278 .procname = "tcp_timestamps", 179 .procname = "tcp_timestamps",
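
The removed proc_tcp_fastopen_key() handler renders the 16-byte key as four dash-separated 32-bit hex words and parses user input back the same way. A self-contained round-trip of that exact formatting:

#include <stdio.h>
#include <string.h>

#define TCP_FASTOPEN_KEY_LENGTH 16

int main(void)
{
        unsigned int key[4] = { 0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10 };
        unsigned int parsed[4];
        char buf[TCP_FASTOPEN_KEY_LENGTH * 2 + 10];

        snprintf(buf, sizeof(buf), "%08x-%08x-%08x-%08x",
                 key[0], key[1], key[2], key[3]);

        if (sscanf(buf, "%x-%x-%x-%x",
                   &parsed[0], &parsed[1], &parsed[2], &parsed[3]) != 4)
                return 1;

        printf("%s round-trips: %s\n", buf,
               memcmp(key, parsed, sizeof(key)) ? "no" : "yes");
        return 0;
}
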
@@ -354,13 +255,6 @@ static struct ctl_table ipv4_table[] = {
354 .proc_handler = proc_dointvec 255 .proc_handler = proc_dointvec
355 }, 256 },
356 { 257 {
357 .procname = "ip_early_demux",
358 .data = &sysctl_ip_early_demux,
359 .maxlen = sizeof(int),
360 .mode = 0644,
361 .proc_handler = proc_dointvec
362 },
363 {
364 .procname = "ip_dynaddr", 258 .procname = "ip_dynaddr",
365 .data = &sysctl_ip_dynaddr, 259 .data = &sysctl_ip_dynaddr,
366 .maxlen = sizeof(int), 260 .maxlen = sizeof(int),
@@ -420,19 +314,6 @@ static struct ctl_table ipv4_table[] = {
420 }, 314 },
421#endif 315#endif
422 { 316 {
423 .procname = "tcp_fastopen",
424 .data = &sysctl_tcp_fastopen,
425 .maxlen = sizeof(int),
426 .mode = 0644,
427 .proc_handler = proc_dointvec,
428 },
429 {
430 .procname = "tcp_fastopen_key",
431 .mode = 0600,
432 .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
433 .proc_handler = proc_tcp_fastopen_key,
434 },
435 {
436 .procname = "tcp_tw_recycle", 317 .procname = "tcp_tw_recycle",
437 .data = &tcp_death_row.sysctl_tw_recycle, 318 .data = &tcp_death_row.sysctl_tw_recycle,
438 .maxlen = sizeof(int), 319 .maxlen = sizeof(int),
@@ -552,6 +433,13 @@ static struct ctl_table ipv4_table[] = {
552 .proc_handler = proc_dointvec 433 .proc_handler = proc_dointvec
553 }, 434 },
554 { 435 {
436 .procname = "tcp_mem",
437 .data = &sysctl_tcp_mem,
438 .maxlen = sizeof(sysctl_tcp_mem),
439 .mode = 0644,
440 .proc_handler = proc_doulongvec_minmax
441 },
442 {
555 .procname = "tcp_wmem", 443 .procname = "tcp_wmem",
556 .data = &sysctl_tcp_wmem, 444 .data = &sysctl_tcp_wmem,
557 .maxlen = sizeof(sysctl_tcp_wmem), 445 .maxlen = sizeof(sysctl_tcp_wmem),
@@ -664,20 +552,6 @@ static struct ctl_table ipv4_table[] = {
664 .mode = 0644, 552 .mode = 0644,
665 .proc_handler = proc_dointvec 553 .proc_handler = proc_dointvec
666 }, 554 },
667 {
668 .procname = "tcp_limit_output_bytes",
669 .data = &sysctl_tcp_limit_output_bytes,
670 .maxlen = sizeof(int),
671 .mode = 0644,
672 .proc_handler = proc_dointvec
673 },
674 {
675 .procname = "tcp_challenge_ack_limit",
676 .data = &sysctl_tcp_challenge_ack_limit,
677 .maxlen = sizeof(int),
678 .mode = 0644,
679 .proc_handler = proc_dointvec
680 },
681#ifdef CONFIG_NET_DMA 555#ifdef CONFIG_NET_DMA
682 { 556 {
683 .procname = "tcp_dma_copybreak", 557 .procname = "tcp_dma_copybreak",
@@ -765,15 +639,6 @@ static struct ctl_table ipv4_table[] = {
765 .proc_handler = proc_dointvec 639 .proc_handler = proc_dointvec
766 }, 640 },
767 { 641 {
768 .procname = "tcp_early_retrans",
769 .data = &sysctl_tcp_early_retrans,
770 .maxlen = sizeof(int),
771 .mode = 0644,
772 .proc_handler = proc_dointvec_minmax,
773 .extra1 = &zero,
774 .extra2 = &two,
775 },
776 {
777 .procname = "udp_mem", 642 .procname = "udp_mem",
778 .data = &sysctl_udp_mem, 643 .data = &sysctl_udp_mem,
779 .maxlen = sizeof(sysctl_udp_mem), 644 .maxlen = sizeof(sysctl_udp_mem),
@@ -843,21 +708,29 @@ static struct ctl_table ipv4_net_table[] = {
843 .proc_handler = proc_dointvec 708 .proc_handler = proc_dointvec
844 }, 709 },
845 { 710 {
846 .procname = "ping_group_range", 711 .procname = "rt_cache_rebuild_count",
847 .data = &init_net.ipv4.sysctl_ping_group_range, 712 .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
848 .maxlen = sizeof(gid_t)*2, 713 .maxlen = sizeof(int),
849 .mode = 0644, 714 .mode = 0644,
850 .proc_handler = ipv4_ping_group_range, 715 .proc_handler = proc_dointvec
851 }, 716 },
852 { 717 {
853 .procname = "tcp_mem", 718 .procname = "ping_group_range",
854 .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem), 719 .data = &init_net.ipv4.sysctl_ping_group_range,
720 .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
855 .mode = 0644, 721 .mode = 0644,
856 .proc_handler = ipv4_tcp_mem, 722 .proc_handler = ipv4_ping_group_range,
857 }, 723 },
858 { } 724 { }
859}; 725};
860 726
727struct ctl_path net_ipv4_ctl_path[] = {
728 { .procname = "net", },
729 { .procname = "ipv4", },
730 { },
731};
732EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
733
861static __net_init int ipv4_sysctl_init_net(struct net *net) 734static __net_init int ipv4_sysctl_init_net(struct net *net)
862{ 735{
863 struct ctl_table *table; 736 struct ctl_table *table;
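
net_ipv4_ctl_path, added back above, is a sentinel-terminated array of path components that older kernels walked to place tables under net/ipv4. A toy userspace walk over such an array; struct ctl_path is redeclared locally only for the demo:

#include <stdio.h>
#include <string.h>

struct ctl_path {
        const char *procname;
};

static const struct ctl_path net_ipv4_ctl_path[] = {
        { .procname = "net",  },
        { .procname = "ipv4", },
        { .procname = NULL,   },        /* sentinel terminating the path */
};

int main(void)
{
        char joined[64] = "";
        const struct ctl_path *p;

        for (p = net_ipv4_ctl_path; p->procname; p++) {
                strcat(joined, p->procname);
                if ((p + 1)->procname)
                        strcat(joined, "/");
        }
        printf("tables would be registered under \"%s\"\n", joined);
        return 0;
}
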
@@ -881,23 +754,23 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
881 table[5].data = 754 table[5].data =
882 &net->ipv4.sysctl_icmp_ratemask; 755 &net->ipv4.sysctl_icmp_ratemask;
883 table[6].data = 756 table[6].data =
757 &net->ipv4.sysctl_rt_cache_rebuild_count;
758 table[7].data =
884 &net->ipv4.sysctl_ping_group_range; 759 &net->ipv4.sysctl_ping_group_range;
885 760
886 /* Don't export sysctls to unprivileged users */
887 if (net->user_ns != &init_user_ns)
888 table[0].procname = NULL;
889 } 761 }
890 762
891 /* 763 /*
892 * Sane defaults - nobody may create ping sockets. 764 * Sane defaults - nobody may create ping sockets.
893 * Boot scripts should set this to distro-specific group. 765 * Boot scripts should set this to distro-specific group.
894 */ 766 */
895 net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1); 767 net->ipv4.sysctl_ping_group_range[0] = 1;
896 net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0); 768 net->ipv4.sysctl_ping_group_range[1] = 0;
897 769
898 tcp_init_mem(net); 770 net->ipv4.sysctl_rt_cache_rebuild_count = 4;
899 771
900 net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); 772 net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
773 net_ipv4_ctl_path, table);
901 if (net->ipv4.ipv4_hdr == NULL) 774 if (net->ipv4.ipv4_hdr == NULL)
902 goto err_reg; 775 goto err_reg;
903 776
@@ -938,12 +811,12 @@ static __init int sysctl_ipv4_init(void)
938 if (!i->procname) 811 if (!i->procname)
939 return -EINVAL; 812 return -EINVAL;
940 813
941 hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); 814 hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
942 if (hdr == NULL) 815 if (hdr == NULL)
943 return -ENOMEM; 816 return -ENOMEM;
944 817
945 if (register_pernet_subsys(&ipv4_sysctl_ops)) { 818 if (register_pernet_subsys(&ipv4_sysctl_ops)) {
946 unregister_net_sysctl_table(hdr); 819 unregister_sysctl_table(hdr);
947 return -ENOMEM; 820 return -ENOMEM;
948 } 821 }
949 822
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2aa69c8ae60..09ced58e6a5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -245,8 +245,6 @@
245 * TCP_CLOSE socket is finished 245 * TCP_CLOSE socket is finished
246 */ 246 */
247 247
248#define pr_fmt(fmt) "TCP: " fmt
249
250#include <linux/kernel.h> 248#include <linux/kernel.h>
251#include <linux/module.h> 249#include <linux/module.h>
252#include <linux/types.h> 250#include <linux/types.h>
@@ -268,12 +266,15 @@
268#include <linux/crypto.h> 266#include <linux/crypto.h>
269#include <linux/time.h> 267#include <linux/time.h>
270#include <linux/slab.h> 268#include <linux/slab.h>
269#include <linux/uid_stat.h>
271 270
272#include <net/icmp.h> 271#include <net/icmp.h>
273#include <net/inet_common.h>
274#include <net/tcp.h> 272#include <net/tcp.h>
275#include <net/xfrm.h> 273#include <net/xfrm.h>
276#include <net/ip.h> 274#include <net/ip.h>
275#include <net/ip6_route.h>
276#include <net/ipv6.h>
277#include <net/transp_v6.h>
277#include <net/netdma.h> 278#include <net/netdma.h>
278#include <net/sock.h> 279#include <net/sock.h>
279 280
@@ -285,9 +286,11 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
285struct percpu_counter tcp_orphan_count; 286struct percpu_counter tcp_orphan_count;
286EXPORT_SYMBOL_GPL(tcp_orphan_count); 287EXPORT_SYMBOL_GPL(tcp_orphan_count);
287 288
289long sysctl_tcp_mem[3] __read_mostly;
288int sysctl_tcp_wmem[3] __read_mostly; 290int sysctl_tcp_wmem[3] __read_mostly;
289int sysctl_tcp_rmem[3] __read_mostly; 291int sysctl_tcp_rmem[3] __read_mostly;
290 292
293EXPORT_SYMBOL(sysctl_tcp_mem);
291EXPORT_SYMBOL(sysctl_tcp_rmem); 294EXPORT_SYMBOL(sysctl_tcp_rmem);
292EXPORT_SYMBOL(sysctl_tcp_wmem); 295EXPORT_SYMBOL(sysctl_tcp_wmem);
293 296
@@ -364,72 +367,6 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
364 return period; 367 return period;
365} 368}
366 369
367/* Address-family independent initialization for a tcp_sock.
368 *
369 * NOTE: A lot of things set to zero explicitly by call to
370 * sk_alloc() so need not be done here.
371 */
372void tcp_init_sock(struct sock *sk)
373{
374 struct inet_connection_sock *icsk = inet_csk(sk);
375 struct tcp_sock *tp = tcp_sk(sk);
376
377 skb_queue_head_init(&tp->out_of_order_queue);
378 tcp_init_xmit_timers(sk);
379 tcp_prequeue_init(tp);
380 INIT_LIST_HEAD(&tp->tsq_node);
381
382 icsk->icsk_rto = TCP_TIMEOUT_INIT;
383 tp->mdev = TCP_TIMEOUT_INIT;
384
385 /* So many TCP implementations out there (incorrectly) count the
386 * initial SYN frame in their delayed-ACK and congestion control
387 * algorithms that we must have the following bandaid to talk
388 * efficiently to them. -DaveM
389 */
390 tp->snd_cwnd = TCP_INIT_CWND;
391
392 /* See draft-stevens-tcpca-spec-01 for discussion of the
393 * initialization of these values.
394 */
395 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
396 tp->snd_cwnd_clamp = ~0;
397 tp->mss_cache = TCP_MSS_DEFAULT;
398
399 tp->reordering = sysctl_tcp_reordering;
400 tcp_enable_early_retrans(tp);
401 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
402
403 sk->sk_state = TCP_CLOSE;
404
405 sk->sk_write_space = sk_stream_write_space;
406 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
407
408 icsk->icsk_sync_mss = tcp_sync_mss;
409
410 /* TCP Cookie Transactions */
411 if (sysctl_tcp_cookie_size > 0) {
412 /* Default, cookies without s_data_payload. */
413 tp->cookie_values =
414 kzalloc(sizeof(*tp->cookie_values),
415 sk->sk_allocation);
416 if (tp->cookie_values != NULL)
417 kref_init(&tp->cookie_values->kref);
418 }
419 /* Presumed zeroed, in order of appearance:
420 * cookie_in_always, cookie_out_never,
421 * s_data_constant, s_data_in, s_data_out
422 */
423 sk->sk_sndbuf = sysctl_tcp_wmem[1];
424 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
425
426 local_bh_disable();
427 sock_update_memcg(sk);
428 sk_sockets_allocated_inc(sk);
429 local_bh_enable();
430}
431EXPORT_SYMBOL(tcp_init_sock);
432
433/* 370/*
434 * Wait for a TCP event. 371 * Wait for a TCP event.
435 * 372 *
@@ -441,7 +378,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
441{ 378{
442 unsigned int mask; 379 unsigned int mask;
443 struct sock *sk = sock->sk; 380 struct sock *sk = sock->sk;
444 const struct tcp_sock *tp = tcp_sk(sk); 381 struct tcp_sock *tp = tcp_sk(sk);
445 382
446 sock_poll_wait(file, sk_sleep(sk), wait); 383 sock_poll_wait(file, sk_sleep(sk), wait);
447 if (sk->sk_state == TCP_LISTEN) 384 if (sk->sk_state == TCP_LISTEN)
@@ -486,9 +423,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
486 if (sk->sk_shutdown & RCV_SHUTDOWN) 423 if (sk->sk_shutdown & RCV_SHUTDOWN)
487 mask |= POLLIN | POLLRDNORM | POLLRDHUP; 424 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
488 425
489 /* Connected or passive Fast Open socket? */ 426 /* Connected? */
490 if (sk->sk_state != TCP_SYN_SENT && 427 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
491 (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
492 int target = sock_rcvlowat(sk, 0, INT_MAX); 428 int target = sock_rcvlowat(sk, 0, INT_MAX);
493 429
494 if (tp->urg_seq == tp->copied_seq && 430 if (tp->urg_seq == tp->copied_seq &&
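
Both versions of the test above rely on the same idiom: each TCP state is a small integer, so (1 << sk->sk_state) masked against a set of TCPF_* bits answers "is the socket in any of these states" with a single branch. An illustrative sketch with locally defined state values:

#include <stdio.h>

/* small distinct integers, mirroring the kernel's enum for the demo */
enum {
        TCP_ESTABLISHED = 1,
        TCP_SYN_SENT,
        TCP_SYN_RECV,
        TCP_FIN_WAIT1,
};

#define TCPF_SYN_SENT   (1 << TCP_SYN_SENT)
#define TCPF_SYN_RECV   (1 << TCP_SYN_RECV)

static int is_connected(int state)
{
        /* true for every state except SYN_SENT and SYN_RECV */
        return ((1 << state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) != 0;
}

int main(void)
{
        printf("ESTABLISHED connected? %d\n", is_connected(TCP_ESTABLISHED));
        printf("SYN_SENT    connected? %d\n", is_connected(TCP_SYN_SENT));
        return 0;
}
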
@@ -536,29 +472,30 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
536{ 472{
537 struct tcp_sock *tp = tcp_sk(sk); 473 struct tcp_sock *tp = tcp_sk(sk);
538 int answ; 474 int answ;
539 bool slow;
540 475
541 switch (cmd) { 476 switch (cmd) {
542 case SIOCINQ: 477 case SIOCINQ:
543 if (sk->sk_state == TCP_LISTEN) 478 if (sk->sk_state == TCP_LISTEN)
544 return -EINVAL; 479 return -EINVAL;
545 480
546 slow = lock_sock_fast(sk); 481 lock_sock(sk);
547 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) 482 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
548 answ = 0; 483 answ = 0;
549 else if (sock_flag(sk, SOCK_URGINLINE) || 484 else if (sock_flag(sk, SOCK_URGINLINE) ||
550 !tp->urg_data || 485 !tp->urg_data ||
551 before(tp->urg_seq, tp->copied_seq) || 486 before(tp->urg_seq, tp->copied_seq) ||
552 !before(tp->urg_seq, tp->rcv_nxt)) { 487 !before(tp->urg_seq, tp->rcv_nxt)) {
488 struct sk_buff *skb;
553 489
554 answ = tp->rcv_nxt - tp->copied_seq; 490 answ = tp->rcv_nxt - tp->copied_seq;
555 491
556 /* Subtract 1, if FIN was received */ 492 /* Subtract 1, if FIN is in queue. */
557 if (answ && sock_flag(sk, SOCK_DONE)) 493 skb = skb_peek_tail(&sk->sk_receive_queue);
558 answ--; 494 if (answ && skb)
495 answ -= tcp_hdr(skb)->fin;
559 } else 496 } else
560 answ = tp->urg_seq - tp->copied_seq; 497 answ = tp->urg_seq - tp->copied_seq;
561 unlock_sock_fast(sk, slow); 498 release_sock(sk);
562 break; 499 break;
563 case SIOCATMARK: 500 case SIOCATMARK:
564 answ = tp->urg_data && tp->urg_seq == tp->copied_seq; 501 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
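
In both variants SIOCINQ reports rcv_nxt - copied_seq, minus one when a FIN is queued, because a FIN consumes a sequence number but carries no payload. A quick arithmetic check of that calculation:

#include <stdio.h>

int main(void)
{
        unsigned int copied_seq = 1000; /* last byte already handed to userspace */
        unsigned int rcv_nxt = 1501;    /* next sequence number expected from peer */
        int fin_queued = 1;
        unsigned int answ;

        answ = rcv_nxt - copied_seq;
        if (answ && fin_queued)
                answ--;                 /* the FIN occupies a sequence number, not data */

        printf("SIOCINQ would report %u bytes\n", answ);       /* 500 */
        return 0;
}
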
@@ -591,11 +528,11 @@ EXPORT_SYMBOL(tcp_ioctl);
591 528
592static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) 529static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
593{ 530{
594 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 531 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
595 tp->pushed_seq = tp->write_seq; 532 tp->pushed_seq = tp->write_seq;
596} 533}
597 534
598static inline bool forced_push(const struct tcp_sock *tp) 535static inline int forced_push(struct tcp_sock *tp)
599{ 536{
600 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); 537 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
601} 538}
@@ -607,7 +544,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
607 544
608 skb->csum = 0; 545 skb->csum = 0;
609 tcb->seq = tcb->end_seq = tp->write_seq; 546 tcb->seq = tcb->end_seq = tp->write_seq;
610 tcb->tcp_flags = TCPHDR_ACK; 547 tcb->flags = TCPHDR_ACK;
611 tcb->sacked = 0; 548 tcb->sacked = 0;
612 skb_header_release(skb); 549 skb_header_release(skb);
613 tcp_add_write_queue_tail(sk, skb); 550 tcp_add_write_queue_tail(sk, skb);
@@ -768,12 +705,11 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
768 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); 705 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
769 if (skb) { 706 if (skb) {
770 if (sk_wmem_schedule(sk, skb->truesize)) { 707 if (sk_wmem_schedule(sk, skb->truesize)) {
771 skb_reserve(skb, sk->sk_prot->max_header);
772 /* 708 /*
773 * Make sure that we have exactly size bytes 709 * Make sure that we have exactly size bytes
774 * available to the caller, no more, no less. 710 * available to the caller, no more, no less.
775 */ 711 */
776 skb->avail_size = size; 712 skb_reserve(skb, skb_tailroom(skb) - size);
777 return skb; 713 return skb;
778 } 714 }
779 __kfree_skb(skb); 715 __kfree_skb(skb);
@@ -798,10 +734,6 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
798 inet_csk(sk)->icsk_ext_hdr_len - 734 inet_csk(sk)->icsk_ext_hdr_len -
799 tp->tcp_header_len); 735 tp->tcp_header_len);
800 736
801 /* TSQ : try to have two TSO segments in flight */
802 xmit_size_goal = min_t(u32, xmit_size_goal,
803 sysctl_tcp_limit_output_bytes >> 1);
804
805 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); 737 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
806 738
807 /* We try hard to avoid divides here */ 739 /* We try hard to avoid divides here */
@@ -811,9 +743,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
811 old_size_goal + mss_now > xmit_size_goal)) { 743 old_size_goal + mss_now > xmit_size_goal)) {
812 xmit_size_goal = old_size_goal; 744 xmit_size_goal = old_size_goal;
813 } else { 745 } else {
814 tp->xmit_size_goal_segs = 746 tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
815 min_t(u16, xmit_size_goal / mss_now,
816 sk->sk_gso_max_segs);
817 xmit_size_goal = tp->xmit_size_goal_segs * mss_now; 747 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
818 } 748 }
819 } 749 }
@@ -831,8 +761,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
831 return mss_now; 761 return mss_now;
832} 762}
833 763
834static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, 764static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
835 size_t size, int flags) 765 size_t psize, int flags)
836{ 766{
837 struct tcp_sock *tp = tcp_sk(sk); 767 struct tcp_sock *tp = tcp_sk(sk);
838 int mss_now, size_goal; 768 int mss_now, size_goal;
@@ -840,15 +770,10 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
840 ssize_t copied; 770 ssize_t copied;
841 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 771 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
842 772
843 /* Wait for a connection to finish. One exception is TCP Fast Open 773 /* Wait for a connection to finish. */
844 * (passive side) where data is allowed to be sent before a connection 774 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
845 * is fully established.
846 */
847 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
848 !tcp_passive_fastopen(sk)) {
849 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 775 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
850 goto out_err; 776 goto out_err;
851 }
852 777
853 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 778 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
854 779
@@ -859,10 +784,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
859 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 784 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
860 goto out_err; 785 goto out_err;
861 786
862 while (size > 0) { 787 while (psize > 0) {
863 struct sk_buff *skb = tcp_write_queue_tail(sk); 788 struct sk_buff *skb = tcp_write_queue_tail(sk);
864 int copy, i; 789 struct page *page = pages[poffset / PAGE_SIZE];
865 bool can_coalesce; 790 int copy, i, can_coalesce;
791 int offset = poffset % PAGE_SIZE;
792 int size = min_t(size_t, psize, PAGE_SIZE - offset);
866 793
867 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { 794 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
868new_segment: 795new_segment:
@@ -890,7 +817,7 @@ new_segment:
890 goto wait_for_memory; 817 goto wait_for_memory;
891 818
892 if (can_coalesce) { 819 if (can_coalesce) {
893 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 820 skb_shinfo(skb)->frags[i - 1].size += copy;
894 } else { 821 } else {
895 get_page(page); 822 get_page(page);
896 skb_fill_page_desc(skb, i, page, offset, copy); 823 skb_fill_page_desc(skb, i, page, offset, copy);
@@ -907,11 +834,11 @@ new_segment:
907 skb_shinfo(skb)->gso_segs = 0; 834 skb_shinfo(skb)->gso_segs = 0;
908 835
909 if (!copied) 836 if (!copied)
910 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 837 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
911 838
912 copied += copy; 839 copied += copy;
913 offset += copy; 840 poffset += copy;
914 if (!(size -= copy)) 841 if (!(psize -= copy))
915 goto out; 842 goto out;
916 843
917 if (skb->len < size_goal || (flags & MSG_OOB)) 844 if (skb->len < size_goal || (flags & MSG_OOB))
@@ -927,7 +854,8 @@ new_segment:
927wait_for_sndbuf: 854wait_for_sndbuf:
928 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 855 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
929wait_for_memory: 856wait_for_memory:
930 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 857 if (copied)
858 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
931 859
932 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 860 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
933 goto do_error; 861 goto do_error;
@@ -936,7 +864,7 @@ wait_for_memory:
936 } 864 }
937 865
938out: 866out:
939 if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) 867 if (copied)
940 tcp_push(sk, flags, mss_now, tp->nonagle); 868 tcp_push(sk, flags, mss_now, tp->nonagle);
941 return copied; 869 return copied;
942 870
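
The older do_tcp_sendpages() signature restored above takes a page array plus a running byte offset (poffset) and recovers the current page and in-page offset with a divide and a modulo. Verifying that arithmetic in isolation, with PAGE_SIZE assumed to be 4096:

#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
        size_t poffset = 10000;                 /* byte offset into the page array */
        size_t page_idx = poffset / PAGE_SIZE;  /* which page holds the next byte */
        size_t offset = poffset % PAGE_SIZE;    /* where inside that page */

        printf("byte %zu -> page %zu, offset %zu\n", poffset, page_idx, offset);
        return 0;
}
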
@@ -958,24 +886,24 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
958 flags); 886 flags);
959 887
960 lock_sock(sk); 888 lock_sock(sk);
961 res = do_tcp_sendpages(sk, page, offset, size, flags); 889 res = do_tcp_sendpages(sk, &page, offset, size, flags);
962 release_sock(sk); 890 release_sock(sk);
963 return res; 891 return res;
964} 892}
965EXPORT_SYMBOL(tcp_sendpage); 893EXPORT_SYMBOL(tcp_sendpage);
966 894
967static inline int select_size(const struct sock *sk, bool sg) 895#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
896#define TCP_OFF(sk) (sk->sk_sndmsg_off)
897
898static inline int select_size(struct sock *sk, int sg)
968{ 899{
969 const struct tcp_sock *tp = tcp_sk(sk); 900 struct tcp_sock *tp = tcp_sk(sk);
970 int tmp = tp->mss_cache; 901 int tmp = tp->mss_cache;
971 902
972 if (sg) { 903 if (sg) {
973 if (sk_can_gso(sk)) { 904 if (sk_can_gso(sk))
974 /* Small frames wont use a full page: 905 tmp = 0;
975 * Payload will immediately follow tcp header. 906 else {
976 */
977 tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
978 } else {
979 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 907 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
980 908
981 if (tmp >= pgbreak && 909 if (tmp >= pgbreak &&
@@ -987,86 +915,27 @@ static inline int select_size(const struct sock *sk, bool sg)
987 return tmp; 915 return tmp;
988} 916}
989 917
990void tcp_free_fastopen_req(struct tcp_sock *tp)
991{
992 if (tp->fastopen_req != NULL) {
993 kfree(tp->fastopen_req);
994 tp->fastopen_req = NULL;
995 }
996}
997
998static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *size)
999{
1000 struct tcp_sock *tp = tcp_sk(sk);
1001 int err, flags;
1002
1003 if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
1004 return -EOPNOTSUPP;
1005 if (tp->fastopen_req != NULL)
1006 return -EALREADY; /* Another Fast Open is in progress */
1007
1008 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1009 sk->sk_allocation);
1010 if (unlikely(tp->fastopen_req == NULL))
1011 return -ENOBUFS;
1012 tp->fastopen_req->data = msg;
1013
1014 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1015 err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
1016 msg->msg_namelen, flags);
1017 *size = tp->fastopen_req->copied;
1018 tcp_free_fastopen_req(tp);
1019 return err;
1020}
1021
1022int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 918int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1023 size_t size) 919 size_t size)
1024{ 920{
1025 struct iovec *iov; 921 struct iovec *iov;
1026 struct tcp_sock *tp = tcp_sk(sk); 922 struct tcp_sock *tp = tcp_sk(sk);
1027 struct sk_buff *skb; 923 struct sk_buff *skb;
1028 int iovlen, flags, err, copied = 0; 924 int iovlen, flags;
1029 int mss_now = 0, size_goal, copied_syn = 0, offset = 0; 925 int mss_now, size_goal;
1030 bool sg; 926 int sg, err, copied;
1031 long timeo; 927 long timeo;
1032 928
1033 lock_sock(sk); 929 lock_sock(sk);
1034 930
1035 flags = msg->msg_flags; 931 flags = msg->msg_flags;
1036 if (flags & MSG_FASTOPEN) {
1037 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn);
1038 if (err == -EINPROGRESS && copied_syn > 0)
1039 goto out;
1040 else if (err)
1041 goto out_err;
1042 offset = copied_syn;
1043 }
1044
1045 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 932 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1046 933
1047 /* Wait for a connection to finish. One exception is TCP Fast Open 934 /* Wait for a connection to finish. */
1048 * (passive side) where data is allowed to be sent before a connection 935 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1049 * is fully established.
1050 */
1051 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1052 !tcp_passive_fastopen(sk)) {
1053 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) 936 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
1054 goto do_error;
1055 }
1056
1057 if (unlikely(tp->repair)) {
1058 if (tp->repair_queue == TCP_RECV_QUEUE) {
1059 copied = tcp_send_rcvq(sk, msg, size);
1060 goto out;
1061 }
1062
1063 err = -EINVAL;
1064 if (tp->repair_queue == TCP_NO_QUEUE)
1065 goto out_err; 937 goto out_err;
1066 938
1067 /* 'common' sending to sendq */
1068 }
1069
1070 /* This should be in poll */ 939 /* This should be in poll */
1071 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 940 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1072 941
@@ -1081,22 +950,13 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1081 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 950 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1082 goto out_err; 951 goto out_err;
1083 952
1084 sg = !!(sk->sk_route_caps & NETIF_F_SG); 953 sg = sk->sk_route_caps & NETIF_F_SG;
1085 954
1086 while (--iovlen >= 0) { 955 while (--iovlen >= 0) {
1087 size_t seglen = iov->iov_len; 956 size_t seglen = iov->iov_len;
1088 unsigned char __user *from = iov->iov_base; 957 unsigned char __user *from = iov->iov_base;
1089 958
1090 iov++; 959 iov++;
1091 if (unlikely(offset > 0)) { /* Skip bytes copied in SYN */
1092 if (offset >= seglen) {
1093 offset -= seglen;
1094 continue;
1095 }
1096 seglen -= offset;
1097 from += offset;
1098 offset = 0;
1099 }
1100 960
1101 while (seglen > 0) { 961 while (seglen > 0) {
1102 int copy = 0; 962 int copy = 0;
@@ -1139,54 +999,86 @@ new_segment:
1139 copy = seglen; 999 copy = seglen;
1140 1000
1141 /* Where to copy to? */ 1001 /* Where to copy to? */
1142 if (skb_availroom(skb) > 0) { 1002 if (skb_tailroom(skb) > 0) {
1143 /* We have some space in skb head. Superb! */ 1003 /* We have some space in skb head. Superb! */
1144 copy = min_t(int, copy, skb_availroom(skb)); 1004 if (copy > skb_tailroom(skb))
1005 copy = skb_tailroom(skb);
1145 err = skb_add_data_nocache(sk, skb, from, copy); 1006 err = skb_add_data_nocache(sk, skb, from, copy);
1146 if (err) 1007 if (err)
1147 goto do_fault; 1008 goto do_fault;
1148 } else { 1009 } else {
1149 bool merge = true; 1010 int merge = 0;
1150 int i = skb_shinfo(skb)->nr_frags; 1011 int i = skb_shinfo(skb)->nr_frags;
1151 struct page_frag *pfrag = sk_page_frag(sk); 1012 struct page *page = TCP_PAGE(sk);
1152 1013 int off = TCP_OFF(sk);
1153 if (!sk_page_frag_refill(sk, pfrag)) 1014
1154 goto wait_for_memory; 1015 if (skb_can_coalesce(skb, i, page, off) &&
1155 1016 off != PAGE_SIZE) {
1156 if (!skb_can_coalesce(skb, i, pfrag->page, 1017 /* We can extend the last page
1157 pfrag->offset)) { 1018 * fragment. */
1158 if (i == MAX_SKB_FRAGS || !sg) { 1019 merge = 1;
1159 tcp_mark_push(tp, skb); 1020 } else if (i == MAX_SKB_FRAGS || !sg) {
1160 goto new_segment; 1021 /* Need to add new fragment and cannot
1022 * do this because interface is non-SG,
1023 * or because all the page slots are
1024 * busy. */
1025 tcp_mark_push(tp, skb);
1026 goto new_segment;
1027 } else if (page) {
1028 if (off == PAGE_SIZE) {
1029 put_page(page);
1030 TCP_PAGE(sk) = page = NULL;
1031 off = 0;
1161 } 1032 }
1162 merge = false; 1033 } else
1163 } 1034 off = 0;
1164 1035
1165 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1036 if (copy > PAGE_SIZE - off)
1037 copy = PAGE_SIZE - off;
1166 1038
1167 if (!sk_wmem_schedule(sk, copy)) 1039 if (!sk_wmem_schedule(sk, copy))
1168 goto wait_for_memory; 1040 goto wait_for_memory;
1169 1041
1042 if (!page) {
1043 /* Allocate new cache page. */
1044 if (!(page = sk_stream_alloc_page(sk)))
1045 goto wait_for_memory;
1046 }
1047
1048 /* Time to copy data. We are close to
1049 * the end! */
1170 err = skb_copy_to_page_nocache(sk, from, skb, 1050 err = skb_copy_to_page_nocache(sk, from, skb,
1171 pfrag->page, 1051 page, off, copy);
1172 pfrag->offset, 1052 if (err) {
1173 copy); 1053 /* If this page was new, give it to the
1174 if (err) 1054 * socket so it does not get leaked.
1055 */
1056 if (!TCP_PAGE(sk)) {
1057 TCP_PAGE(sk) = page;
1058 TCP_OFF(sk) = 0;
1059 }
1175 goto do_error; 1060 goto do_error;
1061 }
1176 1062
1177 /* Update the skb. */ 1063 /* Update the skb. */
1178 if (merge) { 1064 if (merge) {
1179 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1065 skb_shinfo(skb)->frags[i - 1].size +=
1066 copy;
1180 } else { 1067 } else {
1181 skb_fill_page_desc(skb, i, pfrag->page, 1068 skb_fill_page_desc(skb, i, page, off, copy);
1182 pfrag->offset, copy); 1069 if (TCP_PAGE(sk)) {
1183 get_page(pfrag->page); 1070 get_page(page);
1071 } else if (off + copy < PAGE_SIZE) {
1072 get_page(page);
1073 TCP_PAGE(sk) = page;
1074 }
1184 } 1075 }
1185 pfrag->offset += copy; 1076
1077 TCP_OFF(sk) = off + copy;
1186 } 1078 }
1187 1079
1188 if (!copied) 1080 if (!copied)
1189 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; 1081 TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
1190 1082
1191 tp->write_seq += copy; 1083 tp->write_seq += copy;
1192 TCP_SKB_CB(skb)->end_seq += copy; 1084 TCP_SKB_CB(skb)->end_seq += copy;
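
The restored sendmsg path caches one partially filled page per socket (the TCP_PAGE/TCP_OFF macros) so that successive small writes keep landing in the same page until it is full. A simplified userspace model of that coalescing; in the kernel the previous page is owned by the skb fragment rather than simply replaced as it is here:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

struct sock_cache {
        char *page;             /* cached, partially filled page */
        size_t off;             /* first free byte in that page */
};

/* copy len bytes, reusing the cached page while it still has room */
static void cached_copy(struct sock_cache *sk, const char *data, size_t len)
{
        while (len) {
                size_t copy;

                if (!sk->page || sk->off == PAGE_SIZE) {
                        /* the real code hands the full page to an skb frag;
                         * here we simply start a fresh one */
                        sk->page = malloc(PAGE_SIZE);
                        if (!sk->page)
                                return;
                        sk->off = 0;
                }
                copy = len < PAGE_SIZE - sk->off ? len : PAGE_SIZE - sk->off;
                memcpy(sk->page + sk->off, data, copy);
                sk->off += copy;
                data += copy;
                len -= copy;
        }
}

int main(void)
{
        struct sock_cache sk = { NULL, 0 };

        cached_copy(&sk, "hello", 5);
        cached_copy(&sk, "world", 5);
        printf("cached page now holds %zu bytes\n", sk.off);   /* 10 */
        return 0;
}
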
@@ -1197,7 +1089,7 @@ new_segment:
1197 if ((seglen -= copy) == 0 && iovlen == 0) 1089 if ((seglen -= copy) == 0 && iovlen == 0)
1198 goto out; 1090 goto out;
1199 1091
1200 if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) 1092 if (skb->len < max || (flags & MSG_OOB))
1201 continue; 1093 continue;
1202 1094
1203 if (forced_push(tp)) { 1095 if (forced_push(tp)) {
@@ -1224,7 +1116,10 @@ out:
1224 if (copied) 1116 if (copied)
1225 tcp_push(sk, flags, mss_now, tp->nonagle); 1117 tcp_push(sk, flags, mss_now, tp->nonagle);
1226 release_sock(sk); 1118 release_sock(sk);
1227 return copied + copied_syn; 1119
1120 if (copied > 0)
1121 uid_stat_tcp_snd(current_uid(), copied);
1122 return copied;
1228 1123
1229do_fault: 1124do_fault:
1230 if (!skb->len) { 1125 if (!skb->len) {
@@ -1237,7 +1132,7 @@ do_fault:
1237 } 1132 }
1238 1133
1239do_error: 1134do_error:
1240 if (copied + copied_syn) 1135 if (copied)
1241 goto out; 1136 goto out;
1242out_err: 1137out_err:
1243 err = sk_stream_error(sk, flags, err); 1138 err = sk_stream_error(sk, flags, err);
@@ -1295,24 +1190,6 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1295 return -EAGAIN; 1190 return -EAGAIN;
1296} 1191}
1297 1192
1298static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1299{
1300 struct sk_buff *skb;
1301 int copied = 0, err = 0;
1302
1303 /* XXX -- need to support SO_PEEK_OFF */
1304
1305 skb_queue_walk(&sk->sk_write_queue, skb) {
1306 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
1307 if (err)
1308 break;
1309
1310 copied += skb->len;
1311 }
1312
1313 return err ?: copied;
1314}
1315
1316/* Clean up the receive buffer for full frames taken by the user, 1193/* Clean up the receive buffer for full frames taken by the user,
1317 * then send an ACK if necessary. COPIED is the number of bytes 1194 * then send an ACK if necessary. COPIED is the number of bytes
1318 * tcp_recvmsg has given to the user so far, it speeds up the 1195 * tcp_recvmsg has given to the user so far, it speeds up the
@@ -1322,13 +1199,15 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1322void tcp_cleanup_rbuf(struct sock *sk, int copied) 1199void tcp_cleanup_rbuf(struct sock *sk, int copied)
1323{ 1200{
1324 struct tcp_sock *tp = tcp_sk(sk); 1201 struct tcp_sock *tp = tcp_sk(sk);
1325 bool time_to_ack = false; 1202 int time_to_ack = 0;
1326 1203
1204#if TCP_DEBUG
1327 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 1205 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1328 1206
1329 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), 1207 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1330 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", 1208 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1331 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); 1209 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1210#endif
1332 1211
1333 if (inet_csk_ack_scheduled(sk)) { 1212 if (inet_csk_ack_scheduled(sk)) {
1334 const struct inet_connection_sock *icsk = inet_csk(sk); 1213 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1348,7 +1227,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1348 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && 1227 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1349 !icsk->icsk_ack.pingpong)) && 1228 !icsk->icsk_ack.pingpong)) &&
1350 !atomic_read(&sk->sk_rmem_alloc))) 1229 !atomic_read(&sk->sk_rmem_alloc)))
1351 time_to_ack = true; 1230 time_to_ack = 1;
1352 } 1231 }
1353 1232
1354 /* We send an ACK if we can now advertise a non-zero window 1233 /* We send an ACK if we can now advertise a non-zero window
@@ -1370,7 +1249,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
1370 * "Lots" means "at least twice" here. 1249 * "Lots" means "at least twice" here.
1371 */ 1250 */
1372 if (new_window && new_window >= 2 * rcv_window_now) 1251 if (new_window && new_window >= 2 * rcv_window_now)
1373 time_to_ack = true; 1252 time_to_ack = 1;
1374 } 1253 }
1375 } 1254 }
1376 if (time_to_ack) 1255 if (time_to_ack)
@@ -1428,12 +1307,12 @@ static void tcp_service_net_dma(struct sock *sk, bool wait)
1428} 1307}
1429#endif 1308#endif
1430 1309
1431static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) 1310static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1432{ 1311{
1433 struct sk_buff *skb; 1312 struct sk_buff *skb;
1434 u32 offset; 1313 u32 offset;
1435 1314
1436 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { 1315 skb_queue_walk(&sk->sk_receive_queue, skb) {
1437 offset = seq - TCP_SKB_CB(skb)->seq; 1316 offset = seq - TCP_SKB_CB(skb)->seq;
1438 if (tcp_hdr(skb)->syn) 1317 if (tcp_hdr(skb)->syn)
1439 offset--; 1318 offset--;
@@ -1441,11 +1320,6 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1441 *off = offset; 1320 *off = offset;
1442 return skb; 1321 return skb;
1443 } 1322 }
1444 /* This looks weird, but this can happen if TCP collapsing
1445 * splitted a fat GRO packet, while we released socket lock
1446 * in skb_splice_bits()
1447 */
1448 sk_eat_skb(sk, skb, false);
1449 } 1323 }
1450 return NULL; 1324 return NULL;
1451} 1325}
@@ -1487,7 +1361,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1487 break; 1361 break;
1488 } 1362 }
1489 used = recv_actor(desc, skb, offset, len); 1363 used = recv_actor(desc, skb, offset, len);
1490 if (used <= 0) { 1364 if (used < 0) {
1491 if (!copied) 1365 if (!copied)
1492 copied = used; 1366 copied = used;
1493 break; 1367 break;
@@ -1496,26 +1370,22 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1496 copied += used; 1370 copied += used;
1497 offset += used; 1371 offset += used;
1498 } 1372 }
1499 /* If recv_actor drops the lock (e.g. TCP splice 1373 /*
1374 * If recv_actor drops the lock (e.g. TCP splice
1500 * receive) the skb pointer might be invalid when 1375 * receive) the skb pointer might be invalid when
1501 * getting here: tcp_collapse might have deleted it 1376 * getting here: tcp_collapse might have deleted it
1502 * while aggregating skbs from the socket queue. 1377 * while aggregating skbs from the socket queue.
1503 */ 1378 */
1504 skb = tcp_recv_skb(sk, seq - 1, &offset); 1379 skb = tcp_recv_skb(sk, seq-1, &offset);
1505 if (!skb) 1380 if (!skb || (offset+1 != skb->len))
1506 break; 1381 break;
1507 /* TCP coalescing might have appended data to the skb.
1508 * Try to splice more frags
1509 */
1510 if (offset + 1 != skb->len)
1511 continue;
1512 } 1382 }
1513 if (tcp_hdr(skb)->fin) { 1383 if (tcp_hdr(skb)->fin) {
1514 sk_eat_skb(sk, skb, false); 1384 sk_eat_skb(sk, skb, 0);
1515 ++seq; 1385 ++seq;
1516 break; 1386 break;
1517 } 1387 }
1518 sk_eat_skb(sk, skb, false); 1388 sk_eat_skb(sk, skb, 0);
1519 if (!desc->count) 1389 if (!desc->count)
1520 break; 1390 break;
1521 tp->copied_seq = seq; 1391 tp->copied_seq = seq;
@@ -1526,9 +1396,10 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1526 1396
1527 /* Clean up data we have read: This will do ACK frames. */ 1397 /* Clean up data we have read: This will do ACK frames. */
1528 if (copied > 0) { 1398 if (copied > 0) {
1529 tcp_recv_skb(sk, seq, &offset);
1530 tcp_cleanup_rbuf(sk, copied); 1399 tcp_cleanup_rbuf(sk, copied);
1400 uid_stat_tcp_rcv(current_uid(), copied);
1531 } 1401 }
1402
1532 return copied; 1403 return copied;
1533} 1404}
1534EXPORT_SYMBOL(tcp_read_sock); 1405EXPORT_SYMBOL(tcp_read_sock);
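
tcp_recv_skb() above goes back to skb_queue_walk(), i.e. it iterates the receive queue in place to find the segment containing a given sequence number instead of peeking and eating from the head. A stand-alone sketch of that walk over a tiny singly linked queue; struct seg is a simplified stand-in for sk_buff, not the kernel type:

#include <stdio.h>
#include <stddef.h>

struct seg {
        unsigned int seq;       /* sequence number of the segment's first byte */
        unsigned int len;
        struct seg *next;
};

/* walk the queue (like skb_queue_walk) to find the segment holding seq */
static struct seg *find_seg(struct seg *head, unsigned int seq, unsigned int *off)
{
        struct seg *s;

        for (s = head; s; s = s->next) {
                unsigned int offset = seq - s->seq;     /* wraps harmlessly */

                if (offset < s->len) {
                        *off = offset;
                        return s;
                }
        }
        return NULL;
}

int main(void)
{
        struct seg c = { .seq = 300, .len = 100, .next = NULL };
        struct seg b = { .seq = 200, .len = 100, .next = &c };
        struct seg a = { .seq = 100, .len = 100, .next = &b };
        unsigned int off;
        struct seg *s = find_seg(&a, 250, &off);

        if (s)
                printf("seq 250 lives at offset %u of the segment starting at %u\n",
                       off, s->seq);
        return 0;
}
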
@@ -1553,7 +1424,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1553 int target; /* Read at least this many bytes */ 1424 int target; /* Read at least this many bytes */
1554 long timeo; 1425 long timeo;
1555 struct task_struct *user_recv = NULL; 1426 struct task_struct *user_recv = NULL;
1556 bool copied_early = false; 1427 int copied_early = 0;
1557 struct sk_buff *skb; 1428 struct sk_buff *skb;
1558 u32 urg_hole = 0; 1429 u32 urg_hole = 0;
1559 1430
@@ -1569,21 +1440,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1569 if (flags & MSG_OOB) 1440 if (flags & MSG_OOB)
1570 goto recv_urg; 1441 goto recv_urg;
1571 1442
1572 if (unlikely(tp->repair)) {
1573 err = -EPERM;
1574 if (!(flags & MSG_PEEK))
1575 goto out;
1576
1577 if (tp->repair_queue == TCP_SEND_QUEUE)
1578 goto recv_sndq;
1579
1580 err = -EINVAL;
1581 if (tp->repair_queue == TCP_NO_QUEUE)
1582 goto out;
1583
1584 /* 'common' recv queue MSG_PEEK-ing */
1585 }
1586
1587 seq = &tp->copied_seq; 1443 seq = &tp->copied_seq;
1588 if (flags & MSG_PEEK) { 1444 if (flags & MSG_PEEK) {
1589 peek_seq = tp->copied_seq; 1445 peek_seq = tp->copied_seq;
@@ -1604,7 +1460,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1604 if ((available < target) && 1460 if ((available < target) &&
1605 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 1461 (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1606 !sysctl_tcp_low_latency && 1462 !sysctl_tcp_low_latency &&
1607 net_dma_find_channel()) { 1463 dma_find_channel(DMA_MEMCPY)) {
1608 preempt_enable_no_resched(); 1464 preempt_enable_no_resched();
1609 tp->ucopy.pinned_list = 1465 tp->ucopy.pinned_list =
1610 dma_pin_iovec_pages(msg->msg_iov, len); 1466 dma_pin_iovec_pages(msg->msg_iov, len);
@@ -1745,14 +1601,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1745 } 1601 }
1746 1602
1747#ifdef CONFIG_NET_DMA 1603#ifdef CONFIG_NET_DMA
1748 if (tp->ucopy.dma_chan) { 1604 if (tp->ucopy.dma_chan)
1749 if (tp->rcv_wnd == 0 && 1605 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1750 !skb_queue_empty(&sk->sk_async_wait_queue)) {
1751 tcp_service_net_dma(sk, true);
1752 tcp_cleanup_rbuf(sk, copied);
1753 } else
1754 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1755 }
1756#endif 1606#endif
1757 if (copied >= target) { 1607 if (copied >= target) {
1758 /* Do not sleep, just process backlog. */ 1608 /* Do not sleep, just process backlog. */
@@ -1791,9 +1641,9 @@ do_prequeue:
1791 } 1641 }
1792 if ((flags & MSG_PEEK) && 1642 if ((flags & MSG_PEEK) &&
1793 (peek_seq - copied - urg_hole != tp->copied_seq)) { 1643 (peek_seq - copied - urg_hole != tp->copied_seq)) {
1794 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", 1644 if (net_ratelimit())
1795 current->comm, 1645 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1796 task_pid_nr(current)); 1646 current->comm, task_pid_nr(current));
1797 peek_seq = tp->copied_seq; 1647 peek_seq = tp->copied_seq;
1798 } 1648 }
1799 continue; 1649 continue;
@@ -1825,7 +1675,7 @@ do_prequeue:
1825 if (!(flags & MSG_TRUNC)) { 1675 if (!(flags & MSG_TRUNC)) {
1826#ifdef CONFIG_NET_DMA 1676#ifdef CONFIG_NET_DMA
1827 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 1677 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1828 tp->ucopy.dma_chan = net_dma_find_channel(); 1678 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1829 1679
1830 if (tp->ucopy.dma_chan) { 1680 if (tp->ucopy.dma_chan) {
1831 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( 1681 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
@@ -1835,8 +1685,7 @@ do_prequeue:
1835 1685
1836 if (tp->ucopy.dma_cookie < 0) { 1686 if (tp->ucopy.dma_cookie < 0) {
1837 1687
1838 pr_alert("%s: dma_cookie < 0\n", 1688 printk(KERN_ALERT "dma_cookie < 0\n");
1839 __func__);
1840 1689
1841 /* Exception. Bailout! */ 1690 /* Exception. Bailout! */
1842 if (!copied) 1691 if (!copied)
@@ -1847,7 +1696,7 @@ do_prequeue:
1847 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); 1696 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1848 1697
1849 if ((offset + used) == skb->len) 1698 if ((offset + used) == skb->len)
1850 copied_early = true; 1699 copied_early = 1;
1851 1700
1852 } else 1701 } else
1853#endif 1702#endif
@@ -1881,7 +1730,7 @@ skip_copy:
1881 goto found_fin_ok; 1730 goto found_fin_ok;
1882 if (!(flags & MSG_PEEK)) { 1731 if (!(flags & MSG_PEEK)) {
1883 sk_eat_skb(sk, skb, copied_early); 1732 sk_eat_skb(sk, skb, copied_early);
1884 copied_early = false; 1733 copied_early = 0;
1885 } 1734 }
1886 continue; 1735 continue;
1887 1736
@@ -1890,7 +1739,7 @@ skip_copy:
1890 ++*seq; 1739 ++*seq;
1891 if (!(flags & MSG_PEEK)) { 1740 if (!(flags & MSG_PEEK)) {
1892 sk_eat_skb(sk, skb, copied_early); 1741 sk_eat_skb(sk, skb, copied_early);
1893 copied_early = false; 1742 copied_early = 0;
1894 } 1743 }
1895 break; 1744 break;
1896 } while (len > 0); 1745 } while (len > 0);
@@ -1932,6 +1781,9 @@ skip_copy:
1932 tcp_cleanup_rbuf(sk, copied); 1781 tcp_cleanup_rbuf(sk, copied);
1933 1782
1934 release_sock(sk); 1783 release_sock(sk);
1784
1785 if (copied > 0)
1786 uid_stat_tcp_rcv(current_uid(), copied);
1935 return copied; 1787 return copied;
1936 1788
1937out: 1789out:
@@ -1940,10 +1792,8 @@ out:
1940 1792
1941recv_urg: 1793recv_urg:
1942 err = tcp_recv_urg(sk, msg, len, flags); 1794 err = tcp_recv_urg(sk, msg, len, flags);
1943 goto out; 1795 if (err > 0)
1944 1796 uid_stat_tcp_rcv(current_uid(), err);
1945recv_sndq:
1946 err = tcp_peek_sndq(sk, msg, len);
1947 goto out; 1797 goto out;
1948} 1798}
1949EXPORT_SYMBOL(tcp_recvmsg); 1799EXPORT_SYMBOL(tcp_recvmsg);
@@ -2041,20 +1891,6 @@ void tcp_shutdown(struct sock *sk, int how)
2041} 1891}
2042EXPORT_SYMBOL(tcp_shutdown); 1892EXPORT_SYMBOL(tcp_shutdown);
2043 1893
2044bool tcp_check_oom(struct sock *sk, int shift)
2045{
2046 bool too_many_orphans, out_of_socket_memory;
2047
2048 too_many_orphans = tcp_too_many_orphans(sk, shift);
2049 out_of_socket_memory = tcp_out_of_memory(sk);
2050
2051 if (too_many_orphans)
2052 net_info_ratelimited("too many orphaned sockets\n");
2053 if (out_of_socket_memory)
2054 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2055 return too_many_orphans || out_of_socket_memory;
2056}
2057
2058void tcp_close(struct sock *sk, long timeout) 1894void tcp_close(struct sock *sk, long timeout)
2059{ 1895{
2060 struct sk_buff *skb; 1896 struct sk_buff *skb;
@@ -2097,9 +1933,7 @@ void tcp_close(struct sock *sk, long timeout)
2097 * advertise a zero window, then kill -9 the FTP client, wheee... 1933 * advertise a zero window, then kill -9 the FTP client, wheee...
2098 * Note: timeout is always zero in such a case. 1934 * Note: timeout is always zero in such a case.
2099 */ 1935 */
2100 if (unlikely(tcp_sk(sk)->repair)) { 1936 if (data_was_unread) {
2101 sk->sk_prot->disconnect(sk, 0);
2102 } else if (data_was_unread) {
2103 /* Unread data was tossed, zap the connection. */ 1937 /* Unread data was tossed, zap the connection. */
2104 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); 1938 NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2105 tcp_set_state(sk, TCP_CLOSE); 1939 tcp_set_state(sk, TCP_CLOSE);
@@ -2133,10 +1967,6 @@ void tcp_close(struct sock *sk, long timeout)
2133 * they look as CLOSING or LAST_ACK for Linux) 1967 * they look as CLOSING or LAST_ACK for Linux)
2134 * Probably, I missed some more holelets. 1968 * Probably, I missed some more holelets.
2135 * --ANK 1969 * --ANK
2136 * XXX (TFO) - To start off we don't support SYN+ACK+FIN
2137 * in a single packet! (May consider it later but will
2138 * probably need API support or TCP_CORK SYN-ACK until
2139 * data is written and socket is closed.)
2140 */ 1970 */
2141 tcp_send_fin(sk); 1971 tcp_send_fin(sk);
2142 } 1972 }
@@ -2200,7 +2030,10 @@ adjudge_to_death:
2200 } 2030 }
2201 if (sk->sk_state != TCP_CLOSE) { 2031 if (sk->sk_state != TCP_CLOSE) {
2202 sk_mem_reclaim(sk); 2032 sk_mem_reclaim(sk);
2203 if (tcp_check_oom(sk, 0)) { 2033 if (tcp_too_many_orphans(sk, 0)) {
2034 if (net_ratelimit())
2035 printk(KERN_INFO "TCP: too many of orphaned "
2036 "sockets\n");
2204 tcp_set_state(sk, TCP_CLOSE); 2037 tcp_set_state(sk, TCP_CLOSE);
2205 tcp_send_active_reset(sk, GFP_ATOMIC); 2038 tcp_send_active_reset(sk, GFP_ATOMIC);
2206 NET_INC_STATS_BH(sock_net(sk), 2039 NET_INC_STATS_BH(sock_net(sk),
@@ -2208,16 +2041,8 @@ adjudge_to_death:
2208 } 2041 }
2209 } 2042 }
2210 2043
2211 if (sk->sk_state == TCP_CLOSE) { 2044 if (sk->sk_state == TCP_CLOSE)
2212 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
2213 /* We could get here with a non-NULL req if the socket is
2214 * aborted (e.g., closed with unread data) before 3WHS
2215 * finishes.
2216 */
2217 if (req != NULL)
2218 reqsk_fastopen_remove(sk, req, false);
2219 inet_csk_destroy_sock(sk); 2045 inet_csk_destroy_sock(sk);
2220 }
2221 /* Otherwise, socket is reprieved until protocol close. */ 2046 /* Otherwise, socket is reprieved until protocol close. */
2222 2047
2223out: 2048out:
@@ -2229,7 +2054,7 @@ EXPORT_SYMBOL(tcp_close);
2229 2054
2230/* These states need RST on ABORT according to RFC793 */ 2055/* These states need RST on ABORT according to RFC793 */
2231 2056
2232static inline bool tcp_need_reset(int state) 2057static inline int tcp_need_reset(int state)
2233{ 2058{
2234 return (1 << state) & 2059 return (1 << state) &
2235 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | 2060 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
@@ -2250,8 +2075,6 @@ int tcp_disconnect(struct sock *sk, int flags)
2250 /* ABORT function of RFC793 */ 2075 /* ABORT function of RFC793 */
2251 if (old_state == TCP_LISTEN) { 2076 if (old_state == TCP_LISTEN) {
2252 inet_csk_listen_stop(sk); 2077 inet_csk_listen_stop(sk);
2253 } else if (unlikely(tp->repair)) {
2254 sk->sk_err = ECONNABORTED;
2255 } else if (tcp_need_reset(old_state) || 2078 } else if (tcp_need_reset(old_state) ||
2256 (tp->snd_nxt != tp->write_seq && 2079 (tp->snd_nxt != tp->write_seq &&
2257 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { 2080 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
@@ -2303,68 +2126,6 @@ int tcp_disconnect(struct sock *sk, int flags)
2303} 2126}
2304EXPORT_SYMBOL(tcp_disconnect); 2127EXPORT_SYMBOL(tcp_disconnect);
2305 2128
2306void tcp_sock_destruct(struct sock *sk)
2307{
2308 inet_sock_destruct(sk);
2309
2310 kfree(inet_csk(sk)->icsk_accept_queue.fastopenq);
2311}
2312
2313static inline bool tcp_can_repair_sock(const struct sock *sk)
2314{
2315 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2316 ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
2317}
2318
2319static int tcp_repair_options_est(struct tcp_sock *tp,
2320 struct tcp_repair_opt __user *optbuf, unsigned int len)
2321{
2322 struct tcp_repair_opt opt;
2323
2324 while (len >= sizeof(opt)) {
2325 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2326 return -EFAULT;
2327
2328 optbuf++;
2329 len -= sizeof(opt);
2330
2331 switch (opt.opt_code) {
2332 case TCPOPT_MSS:
2333 tp->rx_opt.mss_clamp = opt.opt_val;
2334 break;
2335 case TCPOPT_WINDOW:
2336 {
2337 u16 snd_wscale = opt.opt_val & 0xFFFF;
2338 u16 rcv_wscale = opt.opt_val >> 16;
2339
2340 if (snd_wscale > 14 || rcv_wscale > 14)
2341 return -EFBIG;
2342
2343 tp->rx_opt.snd_wscale = snd_wscale;
2344 tp->rx_opt.rcv_wscale = rcv_wscale;
2345 tp->rx_opt.wscale_ok = 1;
2346 }
2347 break;
2348 case TCPOPT_SACK_PERM:
2349 if (opt.opt_val != 0)
2350 return -EINVAL;
2351
2352 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2353 if (sysctl_tcp_fack)
2354 tcp_enable_fack(tp);
2355 break;
2356 case TCPOPT_TIMESTAMP:
2357 if (opt.opt_val != 0)
2358 return -EINVAL;
2359
2360 tp->rx_opt.tstamp_ok = 1;
2361 break;
2362 }
2363 }
2364
2365 return 0;
2366}
2367
2368/* 2129/*
2369 * Socket option code for TCP. 2130 * Socket option code for TCP.
2370 */ 2131 */
@@ -2535,55 +2296,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2535 err = -EINVAL; 2296 err = -EINVAL;
2536 else 2297 else
2537 tp->thin_dupack = val; 2298 tp->thin_dupack = val;
2538 if (tp->thin_dupack)
2539 tcp_disable_early_retrans(tp);
2540 break;
2541
2542 case TCP_REPAIR:
2543 if (!tcp_can_repair_sock(sk))
2544 err = -EPERM;
2545 else if (val == 1) {
2546 tp->repair = 1;
2547 sk->sk_reuse = SK_FORCE_REUSE;
2548 tp->repair_queue = TCP_NO_QUEUE;
2549 } else if (val == 0) {
2550 tp->repair = 0;
2551 sk->sk_reuse = SK_NO_REUSE;
2552 tcp_send_window_probe(sk);
2553 } else
2554 err = -EINVAL;
2555
2556 break;
2557
2558 case TCP_REPAIR_QUEUE:
2559 if (!tp->repair)
2560 err = -EPERM;
2561 else if (val < TCP_QUEUES_NR)
2562 tp->repair_queue = val;
2563 else
2564 err = -EINVAL;
2565 break;
2566
2567 case TCP_QUEUE_SEQ:
2568 if (sk->sk_state != TCP_CLOSE)
2569 err = -EPERM;
2570 else if (tp->repair_queue == TCP_SEND_QUEUE)
2571 tp->write_seq = val;
2572 else if (tp->repair_queue == TCP_RECV_QUEUE)
2573 tp->rcv_nxt = val;
2574 else
2575 err = -EINVAL;
2576 break;
2577
2578 case TCP_REPAIR_OPTIONS:
2579 if (!tp->repair)
2580 err = -EINVAL;
2581 else if (sk->sk_state == TCP_ESTABLISHED)
2582 err = tcp_repair_options_est(tp,
2583 (struct tcp_repair_opt __user *)optval,
2584 optlen);
2585 else
2586 err = -EPERM;
2587 break; 2299 break;
2588 2300
2589 case TCP_CORK: 2301 case TCP_CORK:
@@ -2698,18 +2410,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2698 /* Cap the max timeout in ms TCP will retry/retrans 2410 /* Cap the max timeout in ms TCP will retry/retrans
2699 * before giving up and aborting (ETIMEDOUT) a connection. 2411 * before giving up and aborting (ETIMEDOUT) a connection.
2700 */ 2412 */
2701 if (val < 0) 2413 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2702 err = -EINVAL;
2703 else
2704 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2705 break;
2706
2707 case TCP_FASTOPEN:
2708 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
2709 TCPF_LISTEN)))
2710 err = fastopen_init_queue(sk, val);
2711 else
2712 err = -EINVAL;
2713 break; 2414 break;
2714 default: 2415 default:
2715 err = -ENOPROTOOPT; 2416 err = -ENOPROTOOPT;
@@ -2723,7 +2424,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2723int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, 2424int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2724 unsigned int optlen) 2425 unsigned int optlen)
2725{ 2426{
2726 const struct inet_connection_sock *icsk = inet_csk(sk); 2427 struct inet_connection_sock *icsk = inet_csk(sk);
2727 2428
2728 if (level != SOL_TCP) 2429 if (level != SOL_TCP)
2729 return icsk->icsk_af_ops->setsockopt(sk, level, optname, 2430 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
@@ -2745,9 +2446,9 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
2745#endif 2446#endif
2746 2447
2747/* Return information about state of tcp endpoint in API format. */ 2448/* Return information about state of tcp endpoint in API format. */
2748void tcp_get_info(const struct sock *sk, struct tcp_info *info) 2449void tcp_get_info(struct sock *sk, struct tcp_info *info)
2749{ 2450{
2750 const struct tcp_sock *tp = tcp_sk(sk); 2451 struct tcp_sock *tp = tcp_sk(sk);
2751 const struct inet_connection_sock *icsk = inet_csk(sk); 2452 const struct inet_connection_sock *icsk = inet_csk(sk);
2752 u32 now = tcp_time_stamp; 2453 u32 now = tcp_time_stamp;
2753 2454
@@ -2769,12 +2470,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
2769 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; 2470 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2770 } 2471 }
2771 2472
2772 if (tp->ecn_flags & TCP_ECN_OK) 2473 if (tp->ecn_flags&TCP_ECN_OK)
2773 info->tcpi_options |= TCPI_OPT_ECN; 2474 info->tcpi_options |= TCPI_OPT_ECN;
2774 if (tp->ecn_flags & TCP_ECN_SEEN)
2775 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
2776 if (tp->syn_data_acked)
2777 info->tcpi_options |= TCPI_OPT_SYN_DATA;
2778 2475
2779 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); 2476 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2780 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); 2477 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
@@ -2832,8 +2529,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2832 val = tp->mss_cache; 2529 val = tp->mss_cache;
2833 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2530 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2834 val = tp->rx_opt.user_mss; 2531 val = tp->rx_opt.user_mss;
2835 if (tp->repair)
2836 val = tp->rx_opt.mss_clamp;
2837 break; 2532 break;
2838 case TCP_NODELAY: 2533 case TCP_NODELAY:
2839 val = !!(tp->nonagle&TCP_NAGLE_OFF); 2534 val = !!(tp->nonagle&TCP_NAGLE_OFF);
@@ -2936,26 +2631,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
2936 val = tp->thin_dupack; 2631 val = tp->thin_dupack;
2937 break; 2632 break;
2938 2633
2939 case TCP_REPAIR:
2940 val = tp->repair;
2941 break;
2942
2943 case TCP_REPAIR_QUEUE:
2944 if (tp->repair)
2945 val = tp->repair_queue;
2946 else
2947 return -EINVAL;
2948 break;
2949
2950 case TCP_QUEUE_SEQ:
2951 if (tp->repair_queue == TCP_SEND_QUEUE)
2952 val = tp->write_seq;
2953 else if (tp->repair_queue == TCP_RECV_QUEUE)
2954 val = tp->rcv_nxt;
2955 else
2956 return -EINVAL;
2957 break;
2958
2959 case TCP_USER_TIMEOUT: 2634 case TCP_USER_TIMEOUT:
2960 val = jiffies_to_msecs(icsk->icsk_user_timeout); 2635 val = jiffies_to_msecs(icsk->icsk_user_timeout);
2961 break; 2636 break;
@@ -2994,12 +2669,11 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2994EXPORT_SYMBOL(compat_tcp_getsockopt); 2669EXPORT_SYMBOL(compat_tcp_getsockopt);
2995#endif 2670#endif
2996 2671
2997struct sk_buff *tcp_tso_segment(struct sk_buff *skb, 2672struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
2998 netdev_features_t features)
2999{ 2673{
3000 struct sk_buff *segs = ERR_PTR(-EINVAL); 2674 struct sk_buff *segs = ERR_PTR(-EINVAL);
3001 struct tcphdr *th; 2675 struct tcphdr *th;
3002 unsigned int thlen; 2676 unsigned thlen;
3003 unsigned int seq; 2677 unsigned int seq;
3004 __be32 delta; 2678 __be32 delta;
3005 unsigned int oldlen; 2679 unsigned int oldlen;
@@ -3198,25 +2872,26 @@ EXPORT_SYMBOL(tcp_gro_complete);
3198 2872
3199#ifdef CONFIG_TCP_MD5SIG 2873#ifdef CONFIG_TCP_MD5SIG
3200static unsigned long tcp_md5sig_users; 2874static unsigned long tcp_md5sig_users;
3201static struct tcp_md5sig_pool __percpu *tcp_md5sig_pool; 2875static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
3202static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); 2876static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
3203 2877
3204static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool __percpu *pool) 2878static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
3205{ 2879{
3206 int cpu; 2880 int cpu;
3207
3208 for_each_possible_cpu(cpu) { 2881 for_each_possible_cpu(cpu) {
3209 struct tcp_md5sig_pool *p = per_cpu_ptr(pool, cpu); 2882 struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
3210 2883 if (p) {
3211 if (p->md5_desc.tfm) 2884 if (p->md5_desc.tfm)
3212 crypto_free_hash(p->md5_desc.tfm); 2885 crypto_free_hash(p->md5_desc.tfm);
2886 kfree(p);
2887 }
3213 } 2888 }
3214 free_percpu(pool); 2889 free_percpu(pool);
3215} 2890}
3216 2891
3217void tcp_free_md5sig_pool(void) 2892void tcp_free_md5sig_pool(void)
3218{ 2893{
3219 struct tcp_md5sig_pool __percpu *pool = NULL; 2894 struct tcp_md5sig_pool * __percpu *pool = NULL;
3220 2895
3221 spin_lock_bh(&tcp_md5sig_pool_lock); 2896 spin_lock_bh(&tcp_md5sig_pool_lock);
3222 if (--tcp_md5sig_users == 0) { 2897 if (--tcp_md5sig_users == 0) {
@@ -3229,24 +2904,30 @@ void tcp_free_md5sig_pool(void)
3229} 2904}
3230EXPORT_SYMBOL(tcp_free_md5sig_pool); 2905EXPORT_SYMBOL(tcp_free_md5sig_pool);
3231 2906
3232static struct tcp_md5sig_pool __percpu * 2907static struct tcp_md5sig_pool * __percpu *
3233__tcp_alloc_md5sig_pool(struct sock *sk) 2908__tcp_alloc_md5sig_pool(struct sock *sk)
3234{ 2909{
3235 int cpu; 2910 int cpu;
3236 struct tcp_md5sig_pool __percpu *pool; 2911 struct tcp_md5sig_pool * __percpu *pool;
3237 2912
3238 pool = alloc_percpu(struct tcp_md5sig_pool); 2913 pool = alloc_percpu(struct tcp_md5sig_pool *);
3239 if (!pool) 2914 if (!pool)
3240 return NULL; 2915 return NULL;
3241 2916
3242 for_each_possible_cpu(cpu) { 2917 for_each_possible_cpu(cpu) {
2918 struct tcp_md5sig_pool *p;
3243 struct crypto_hash *hash; 2919 struct crypto_hash *hash;
3244 2920
2921 p = kzalloc(sizeof(*p), sk->sk_allocation);
2922 if (!p)
2923 goto out_free;
2924 *per_cpu_ptr(pool, cpu) = p;
2925
3245 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); 2926 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
3246 if (!hash || IS_ERR(hash)) 2927 if (!hash || IS_ERR(hash))
3247 goto out_free; 2928 goto out_free;
3248 2929
3249 per_cpu_ptr(pool, cpu)->md5_desc.tfm = hash; 2930 p->md5_desc.tfm = hash;
3250 } 2931 }
3251 return pool; 2932 return pool;
3252out_free: 2933out_free:
@@ -3254,16 +2935,16 @@ out_free:
3254 return NULL; 2935 return NULL;
3255} 2936}
3256 2937
3257struct tcp_md5sig_pool __percpu *tcp_alloc_md5sig_pool(struct sock *sk) 2938struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
3258{ 2939{
3259 struct tcp_md5sig_pool __percpu *pool; 2940 struct tcp_md5sig_pool * __percpu *pool;
3260 bool alloc = false; 2941 int alloc = 0;
3261 2942
3262retry: 2943retry:
3263 spin_lock_bh(&tcp_md5sig_pool_lock); 2944 spin_lock_bh(&tcp_md5sig_pool_lock);
3264 pool = tcp_md5sig_pool; 2945 pool = tcp_md5sig_pool;
3265 if (tcp_md5sig_users++ == 0) { 2946 if (tcp_md5sig_users++ == 0) {
3266 alloc = true; 2947 alloc = 1;
3267 spin_unlock_bh(&tcp_md5sig_pool_lock); 2948 spin_unlock_bh(&tcp_md5sig_pool_lock);
3268 } else if (!pool) { 2949 } else if (!pool) {
3269 tcp_md5sig_users--; 2950 tcp_md5sig_users--;
@@ -3275,7 +2956,7 @@ retry:
3275 2956
3276 if (alloc) { 2957 if (alloc) {
3277 /* we cannot hold spinlock here because this may sleep. */ 2958 /* we cannot hold spinlock here because this may sleep. */
3278 struct tcp_md5sig_pool __percpu *p; 2959 struct tcp_md5sig_pool * __percpu *p;
3279 2960
3280 p = __tcp_alloc_md5sig_pool(sk); 2961 p = __tcp_alloc_md5sig_pool(sk);
3281 spin_lock_bh(&tcp_md5sig_pool_lock); 2962 spin_lock_bh(&tcp_md5sig_pool_lock);
@@ -3308,7 +2989,7 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3308 */ 2989 */
3309struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) 2990struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3310{ 2991{
3311 struct tcp_md5sig_pool __percpu *p; 2992 struct tcp_md5sig_pool * __percpu *p;
3312 2993
3313 local_bh_disable(); 2994 local_bh_disable();
3314 2995
@@ -3319,7 +3000,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3319 spin_unlock(&tcp_md5sig_pool_lock); 3000 spin_unlock(&tcp_md5sig_pool_lock);
3320 3001
3321 if (p) 3002 if (p)
3322 return this_cpu_ptr(p); 3003 return *this_cpu_ptr(p);
3323 3004
3324 local_bh_enable(); 3005 local_bh_enable();
3325 return NULL; 3006 return NULL;
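The MD5 pool hunks above swap a flat per-CPU allocation of struct tcp_md5sig_pool back to a per-CPU array of pointers, which is why the restored code adds a kzalloc()/kfree() per CPU and an extra dereference in tcp_get_md5sig_pool(). A minimal kernel-style sketch of the two layouts (illustrative only, not taken from the patch; struct item stands in for tcp_md5sig_pool):

struct item { int v; };

static void percpu_layout_demo(void)
{
	int cpu;
	/* (a) flat layout: the struct itself lives in per-CPU storage */
	struct item __percpu *direct = alloc_percpu(struct item);
	/* (b) pointer layout restored by this patch: each CPU slot holds a
	 * pointer that still has to be filled by a separate kzalloc() */
	struct item * __percpu *indirect = alloc_percpu(struct item *);

	if (!direct || !indirect)
		goto out;

	for_each_possible_cpu(cpu) {
		per_cpu_ptr(direct, cpu)->v = 1;	/* usable at once */
		*per_cpu_ptr(indirect, cpu) = NULL;	/* not yet allocated */
	}
out:
	free_percpu(direct);
	free_percpu(indirect);
}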
@@ -3334,32 +3015,30 @@ void tcp_put_md5sig_pool(void)
3334EXPORT_SYMBOL(tcp_put_md5sig_pool); 3015EXPORT_SYMBOL(tcp_put_md5sig_pool);
3335 3016
3336int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, 3017int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
3337 const struct tcphdr *th) 3018 struct tcphdr *th)
3338{ 3019{
3339 struct scatterlist sg; 3020 struct scatterlist sg;
3340 struct tcphdr hdr;
3341 int err; 3021 int err;
3342 3022
3343 /* We are not allowed to change tcphdr, make a local copy */ 3023 __sum16 old_checksum = th->check;
3344 memcpy(&hdr, th, sizeof(hdr)); 3024 th->check = 0;
3345 hdr.check = 0;
3346
3347 /* options aren't included in the hash */ 3025 /* options aren't included in the hash */
3348 sg_init_one(&sg, &hdr, sizeof(hdr)); 3026 sg_init_one(&sg, th, sizeof(struct tcphdr));
3349 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr)); 3027 err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr));
3028 th->check = old_checksum;
3350 return err; 3029 return err;
3351} 3030}
3352EXPORT_SYMBOL(tcp_md5_hash_header); 3031EXPORT_SYMBOL(tcp_md5_hash_header);
3353 3032
3354int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, 3033int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3355 const struct sk_buff *skb, unsigned int header_len) 3034 struct sk_buff *skb, unsigned header_len)
3356{ 3035{
3357 struct scatterlist sg; 3036 struct scatterlist sg;
3358 const struct tcphdr *tp = tcp_hdr(skb); 3037 const struct tcphdr *tp = tcp_hdr(skb);
3359 struct hash_desc *desc = &hp->md5_desc; 3038 struct hash_desc *desc = &hp->md5_desc;
3360 unsigned int i; 3039 unsigned i;
3361 const unsigned int head_data_len = skb_headlen(skb) > header_len ? 3040 const unsigned head_data_len = skb_headlen(skb) > header_len ?
3362 skb_headlen(skb) - header_len : 0; 3041 skb_headlen(skb) - header_len : 0;
3363 const struct skb_shared_info *shi = skb_shinfo(skb); 3042 const struct skb_shared_info *shi = skb_shinfo(skb);
3364 struct sk_buff *frag_iter; 3043 struct sk_buff *frag_iter;
3365 3044
@@ -3371,9 +3050,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3371 3050
3372 for (i = 0; i < shi->nr_frags; ++i) { 3051 for (i = 0; i < shi->nr_frags; ++i) {
3373 const struct skb_frag_struct *f = &shi->frags[i]; 3052 const struct skb_frag_struct *f = &shi->frags[i];
3374 struct page *page = skb_frag_page(f); 3053 sg_set_page(&sg, f->page, f->size, f->page_offset);
3375 sg_set_page(&sg, page, skb_frag_size(f), f->page_offset); 3054 if (crypto_hash_update(desc, &sg, f->size))
3376 if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
3377 return 1; 3055 return 1;
3378 } 3056 }
3379 3057
@@ -3385,7 +3063,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3385} 3063}
3386EXPORT_SYMBOL(tcp_md5_hash_skb_data); 3064EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3387 3065
3388int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key) 3066int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
3389{ 3067{
3390 struct scatterlist sg; 3068 struct scatterlist sg;
3391 3069
@@ -3396,7 +3074,8 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
3396 3074
3397#endif 3075#endif
3398 3076
3399/* Each Responder maintains up to two secret values concurrently for 3077/**
3078 * Each Responder maintains up to two secret values concurrently for
3400 * efficient secret rollover. Each secret value has 4 states: 3079 * efficient secret rollover. Each secret value has 4 states:
3401 * 3080 *
3402 * Generating. (tcp_secret_generating != tcp_secret_primary) 3081 * Generating. (tcp_secret_generating != tcp_secret_primary)
@@ -3526,15 +3205,11 @@ EXPORT_SYMBOL(tcp_cookie_generator);
3526 3205
3527void tcp_done(struct sock *sk) 3206void tcp_done(struct sock *sk)
3528{ 3207{
3529 struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
3530
3531 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) 3208 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3532 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); 3209 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3533 3210
3534 tcp_set_state(sk, TCP_CLOSE); 3211 tcp_set_state(sk, TCP_CLOSE);
3535 tcp_clear_xmit_timers(sk); 3212 tcp_clear_xmit_timers(sk);
3536 if (req != NULL)
3537 reqsk_fastopen_remove(sk, req, false);
3538 3213
3539 sk->sk_shutdown = SHUTDOWN_MASK; 3214 sk->sk_shutdown = SHUTDOWN_MASK;
3540 3215
@@ -3550,34 +3225,18 @@ extern struct tcp_congestion_ops tcp_reno;
3550static __initdata unsigned long thash_entries; 3225static __initdata unsigned long thash_entries;
3551static int __init set_thash_entries(char *str) 3226static int __init set_thash_entries(char *str)
3552{ 3227{
3553 ssize_t ret;
3554
3555 if (!str) 3228 if (!str)
3556 return 0; 3229 return 0;
3557 3230 thash_entries = simple_strtoul(str, &str, 0);
3558 ret = kstrtoul(str, 0, &thash_entries);
3559 if (ret)
3560 return 0;
3561
3562 return 1; 3231 return 1;
3563} 3232}
3564__setup("thash_entries=", set_thash_entries); 3233__setup("thash_entries=", set_thash_entries);
3565 3234
3566void tcp_init_mem(struct net *net)
3567{
3568 unsigned long limit = nr_free_buffer_pages() / 8;
3569 limit = max(limit, 128UL);
3570 net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
3571 net->ipv4.sysctl_tcp_mem[1] = limit;
3572 net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
3573}
3574
3575void __init tcp_init(void) 3235void __init tcp_init(void)
3576{ 3236{
3577 struct sk_buff *skb = NULL; 3237 struct sk_buff *skb = NULL;
3578 unsigned long limit; 3238 unsigned long limit;
3579 int max_rshare, max_wshare, cnt; 3239 int i, max_share, cnt;
3580 unsigned int i;
3581 unsigned long jiffy = jiffies; 3240 unsigned long jiffy = jiffies;
3582 3241
3583 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); 3242 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3598,11 +3257,11 @@ void __init tcp_init(void)
3598 alloc_large_system_hash("TCP established", 3257 alloc_large_system_hash("TCP established",
3599 sizeof(struct inet_ehash_bucket), 3258 sizeof(struct inet_ehash_bucket),
3600 thash_entries, 3259 thash_entries,
3601 17, /* one slot per 128 KB of memory */ 3260 (totalram_pages >= 128 * 1024) ?
3261 13 : 15,
3602 0, 3262 0,
3603 NULL, 3263 NULL,
3604 &tcp_hashinfo.ehash_mask, 3264 &tcp_hashinfo.ehash_mask,
3605 0,
3606 thash_entries ? 0 : 512 * 1024); 3265 thash_entries ? 0 : 512 * 1024);
3607 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { 3266 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
3608 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); 3267 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
@@ -3614,13 +3273,13 @@ void __init tcp_init(void)
3614 alloc_large_system_hash("TCP bind", 3273 alloc_large_system_hash("TCP bind",
3615 sizeof(struct inet_bind_hashbucket), 3274 sizeof(struct inet_bind_hashbucket),
3616 tcp_hashinfo.ehash_mask + 1, 3275 tcp_hashinfo.ehash_mask + 1,
3617 17, /* one slot per 128 KB of memory */ 3276 (totalram_pages >= 128 * 1024) ?
3277 13 : 15,
3618 0, 3278 0,
3619 &tcp_hashinfo.bhash_size, 3279 &tcp_hashinfo.bhash_size,
3620 NULL, 3280 NULL,
3621 0,
3622 64 * 1024); 3281 64 * 1024);
3623 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; 3282 tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
3624 for (i = 0; i < tcp_hashinfo.bhash_size; i++) { 3283 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
3625 spin_lock_init(&tcp_hashinfo.bhash[i].lock); 3284 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
3626 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); 3285 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
@@ -3633,24 +3292,27 @@ void __init tcp_init(void)
3633 sysctl_tcp_max_orphans = cnt / 2; 3292 sysctl_tcp_max_orphans = cnt / 2;
3634 sysctl_max_syn_backlog = max(128, cnt / 256); 3293 sysctl_max_syn_backlog = max(128, cnt / 256);
3635 3294
3636 tcp_init_mem(&init_net); 3295 limit = nr_free_buffer_pages() / 8;
3296 limit = max(limit, 128UL);
3297 sysctl_tcp_mem[0] = limit / 4 * 3;
3298 sysctl_tcp_mem[1] = limit;
3299 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
3300
3637 /* Set per-socket limits to no more than 1/128 the pressure threshold */ 3301 /* Set per-socket limits to no more than 1/128 the pressure threshold */
3638 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); 3302 limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
3639 max_wshare = min(4UL*1024*1024, limit); 3303 max_share = min(4UL*1024*1024, limit);
3640 max_rshare = min(6UL*1024*1024, limit);
3641 3304
3642 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; 3305 sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
3643 sysctl_tcp_wmem[1] = 16*1024; 3306 sysctl_tcp_wmem[1] = 16*1024;
3644 sysctl_tcp_wmem[2] = max(64*1024, max_wshare); 3307 sysctl_tcp_wmem[2] = max(64*1024, max_share);
3645 3308
3646 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; 3309 sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
3647 sysctl_tcp_rmem[1] = 87380; 3310 sysctl_tcp_rmem[1] = 87380;
3648 sysctl_tcp_rmem[2] = max(87380, max_rshare); 3311 sysctl_tcp_rmem[2] = max(87380, max_share);
3649 3312
3650 pr_info("Hash tables configured (established %u bind %u)\n", 3313 printk(KERN_INFO "TCP: Hash tables configured "
3651 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3314 "(established %u bind %u)\n",
3652 3315 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
3653 tcp_metrics_init();
3654 3316
3655 tcp_register_congestion_control(&tcp_reno); 3317 tcp_register_congestion_control(&tcp_reno);
3656 3318
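To put the restored sysctl defaults in concrete terms (illustrative arithmetic, assuming 4 KiB pages): with about 2,097,152 free buffer pages (8 GiB), limit = 262,144 pages, so sysctl_tcp_mem becomes {196608, 262144, 393216} pages. The per-socket bound is then 262,144 << (PAGE_SHIFT - 7) = 8,388,608 bytes, capped by max_share to 4 MiB, so both sysctl_tcp_wmem[2] and sysctl_tcp_rmem[2] end up at 4 MiB. The newer code removed here instead derived the cap directly from nr_free_buffer_pages(), kept separate 4 MiB write and 6 MiB read shares, and initialized tcp_mem per network namespace through tcp_init_mem().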
@@ -3662,5 +3324,108 @@ void __init tcp_init(void)
3662 tcp_secret_primary = &tcp_secret_one; 3324 tcp_secret_primary = &tcp_secret_one;
3663 tcp_secret_retiring = &tcp_secret_two; 3325 tcp_secret_retiring = &tcp_secret_two;
3664 tcp_secret_secondary = &tcp_secret_two; 3326 tcp_secret_secondary = &tcp_secret_two;
3665 tcp_tasklet_init(); 3327}
3328
3329static int tcp_is_local(struct net *net, __be32 addr) {
3330 struct rtable *rt;
3331 struct flowi4 fl4 = { .daddr = addr };
3332 rt = ip_route_output_key(net, &fl4);
3333 if (IS_ERR_OR_NULL(rt))
3334 return 0;
3335 return rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK);
3336}
3337
3338#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3339static int tcp_is_local6(struct net *net, struct in6_addr *addr) {
3340 struct rt6_info *rt6 = rt6_lookup(net, addr, addr, 0, 0);
3341 return rt6 && rt6->rt6i_dev && (rt6->rt6i_dev->flags & IFF_LOOPBACK);
3342}
3343#endif
3344
3345/*
3346 * tcp_nuke_addr - destroy all sockets on the given local address
3347 * if local address is the unspecified address (0.0.0.0 or ::), destroy all
3348 * sockets with local addresses that are not configured.
3349 */
3350int tcp_nuke_addr(struct net *net, struct sockaddr *addr)
3351{
3352 int family = addr->sa_family;
3353 unsigned int bucket;
3354
3355 struct in_addr *in;
3356#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3357 struct in6_addr *in6;
3358#endif
3359 if (family == AF_INET) {
3360 in = &((struct sockaddr_in *)addr)->sin_addr;
3361#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3362 } else if (family == AF_INET6) {
3363 in6 = &((struct sockaddr_in6 *)addr)->sin6_addr;
3364#endif
3365 } else {
3366 return -EAFNOSUPPORT;
3367 }
3368
3369 for (bucket = 0; bucket < tcp_hashinfo.ehash_mask; bucket++) {
3370 struct hlist_nulls_node *node;
3371 struct sock *sk;
3372 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, bucket);
3373
3374restart:
3375 spin_lock_bh(lock);
3376 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[bucket].chain) {
3377 struct inet_sock *inet = inet_sk(sk);
3378
3379 if (sysctl_ip_dynaddr && sk->sk_state == TCP_SYN_SENT)
3380 continue;
3381 if (sock_flag(sk, SOCK_DEAD))
3382 continue;
3383
3384 if (family == AF_INET) {
3385 __be32 s4 = inet->inet_rcv_saddr;
3386 if (s4 == LOOPBACK4_IPV6)
3387 continue;
3388
3389 if (in->s_addr != s4 &&
3390 !(in->s_addr == INADDR_ANY &&
3391 !tcp_is_local(net, s4)))
3392 continue;
3393 }
3394
3395#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
3396 if (family == AF_INET6) {
3397 struct in6_addr *s6;
3398 if (!inet->pinet6)
3399 continue;
3400
3401 s6 = &inet->pinet6->rcv_saddr;
3402 if (ipv6_addr_type(s6) == IPV6_ADDR_MAPPED)
3403 continue;
3404
3405 if (!ipv6_addr_equal(in6, s6) &&
3406 !(ipv6_addr_equal(in6, &in6addr_any) &&
3407 !tcp_is_local6(net, s6)))
3408 continue;
3409 }
3410#endif
3411
3412 sock_hold(sk);
3413 spin_unlock_bh(lock);
3414
3415 local_bh_disable();
3416 bh_lock_sock(sk);
3417 sk->sk_err = ETIMEDOUT;
3418 sk->sk_error_report(sk);
3419
3420 tcp_done(sk);
3421 bh_unlock_sock(sk);
3422 local_bh_enable();
3423 sock_put(sk);
3424
3425 goto restart;
3426 }
3427 spin_unlock_bh(lock);
3428 }
3429
3430 return 0;
3666} 3431}
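tcp_nuke_addr(), added by the hunk above, walks the established hash and aborts with ETIMEDOUT every socket bound to the given local address (or, for 0.0.0.0/::, every socket whose local address is no longer configured). A hedged caller sketch only, with an invented wrapper name and a documentation address; on Android kernels this entry point is normally reached through an ioctl rather than called directly:

static int nuke_example(struct net *net)
{
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_addr   = { .s_addr = htonl(0xC0000201) },	/* 192.0.2.1 */
	};

	return tcp_nuke_addr(net, (struct sockaddr *)&sin);
}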
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index f45e1c24244..6187eb4d1dc 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -63,6 +63,7 @@ static inline void bictcp_reset(struct bictcp *ca)
63{ 63{
64 ca->cnt = 0; 64 ca->cnt = 0;
65 ca->last_max_cwnd = 0; 65 ca->last_max_cwnd = 0;
66 ca->loss_cwnd = 0;
66 ca->last_cwnd = 0; 67 ca->last_cwnd = 0;
67 ca->last_time = 0; 68 ca->last_time = 0;
68 ca->epoch_start = 0; 69 ca->epoch_start = 0;
@@ -71,11 +72,7 @@ static inline void bictcp_reset(struct bictcp *ca)
71 72
72static void bictcp_init(struct sock *sk) 73static void bictcp_init(struct sock *sk)
73{ 74{
74 struct bictcp *ca = inet_csk_ca(sk); 75 bictcp_reset(inet_csk_ca(sk));
75
76 bictcp_reset(ca);
77 ca->loss_cwnd = 0;
78
79 if (initial_ssthresh) 76 if (initial_ssthresh)
80 tcp_sk(sk)->snd_ssthresh = initial_ssthresh; 77 tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
81} 78}
@@ -130,7 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
130 } 127 }
131 128
132 /* if in slow start or link utilization is very low */ 129 /* if in slow start or link utilization is very low */
133 if (ca->last_max_cwnd == 0) { 130 if (ca->loss_cwnd == 0) {
134 if (ca->cnt > 20) /* increase cwnd 5% per RTT */ 131 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
135 ca->cnt = 20; 132 ca->cnt = 20;
136 } 133 }
@@ -188,7 +185,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
188{ 185{
189 const struct tcp_sock *tp = tcp_sk(sk); 186 const struct tcp_sock *tp = tcp_sk(sk);
190 const struct bictcp *ca = inet_csk_ca(sk); 187 const struct bictcp *ca = inet_csk_ca(sk);
191 return max(tp->snd_cwnd, ca->loss_cwnd); 188 return max(tp->snd_cwnd, ca->last_max_cwnd);
192} 189}
193 190
194static void bictcp_state(struct sock *sk, u8 new_state) 191static void bictcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 291f2ed7cc3..850c737e08e 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -1,13 +1,11 @@
1/* 1/*
2 * Plugable TCP congestion control support and newReno 2 * Plugable TCP congestion control support and newReno
3 * congestion control. 3 * congestion control.
4 * Based on ideas from I/O scheduler support and Web100. 4 * Based on ideas from I/O scheduler suport and Web100.
5 * 5 *
6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> 6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
7 */ 7 */
8 8
9#define pr_fmt(fmt) "TCP: " fmt
10
11#include <linux/module.h> 9#include <linux/module.h>
12#include <linux/mm.h> 10#include <linux/mm.h>
13#include <linux/types.h> 11#include <linux/types.h>
@@ -43,17 +41,18 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
43 41
44 /* all algorithms must implement ssthresh and cong_avoid ops */ 42 /* all algorithms must implement ssthresh and cong_avoid ops */
45 if (!ca->ssthresh || !ca->cong_avoid) { 43 if (!ca->ssthresh || !ca->cong_avoid) {
46 pr_err("%s does not implement required ops\n", ca->name); 44 printk(KERN_ERR "TCP %s does not implement required ops\n",
45 ca->name);
47 return -EINVAL; 46 return -EINVAL;
48 } 47 }
49 48
50 spin_lock(&tcp_cong_list_lock); 49 spin_lock(&tcp_cong_list_lock);
51 if (tcp_ca_find(ca->name)) { 50 if (tcp_ca_find(ca->name)) {
52 pr_notice("%s already registered\n", ca->name); 51 printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
53 ret = -EEXIST; 52 ret = -EEXIST;
54 } else { 53 } else {
55 list_add_tail_rcu(&ca->list, &tcp_cong_list); 54 list_add_tail_rcu(&ca->list, &tcp_cong_list);
56 pr_info("%s registered\n", ca->name); 55 printk(KERN_INFO "TCP %s registered\n", ca->name);
57 } 56 }
58 spin_unlock(&tcp_cong_list_lock); 57 spin_unlock(&tcp_cong_list_lock);
59 58
@@ -259,8 +258,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
259 if (!ca) 258 if (!ca)
260 err = -ENOENT; 259 err = -ENOENT;
261 260
262 else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || 261 else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN)))
263 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
264 err = -EPERM; 262 err = -EPERM;
265 263
266 else if (!try_module_get(ca->owner)) 264 else if (!try_module_get(ca->owner))
@@ -281,21 +279,20 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
281/* RFC2861 Check whether we are limited by application or congestion window 279/* RFC2861 Check whether we are limited by application or congestion window
282 * This is the inverse of cwnd check in tcp_tso_should_defer 280 * This is the inverse of cwnd check in tcp_tso_should_defer
283 */ 281 */
284bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) 282int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
285{ 283{
286 const struct tcp_sock *tp = tcp_sk(sk); 284 const struct tcp_sock *tp = tcp_sk(sk);
287 u32 left; 285 u32 left;
288 286
289 if (in_flight >= tp->snd_cwnd) 287 if (in_flight >= tp->snd_cwnd)
290 return true; 288 return 1;
291 289
292 left = tp->snd_cwnd - in_flight; 290 left = tp->snd_cwnd - in_flight;
293 if (sk_can_gso(sk) && 291 if (sk_can_gso(sk) &&
294 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && 292 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
295 left * tp->mss_cache < sk->sk_gso_max_size && 293 left * tp->mss_cache < sk->sk_gso_max_size)
296 left < sk->sk_gso_max_segs) 294 return 1;
297 return true; 295 return left <= tcp_max_burst(tp);
298 return left <= tcp_max_tso_deferred_mss(tp);
299} 296}
300EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited); 297EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
301 298
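tcp_is_cwnd_limited(), reverted above from bool back to int and from the gso_max_segs-aware check back to tcp_max_burst(), is the gate a cong_avoid hook consults before growing cwnd. A simplified sketch of that calling pattern, loosely modelled on tcp_reno_cong_avoid() but with a reduced argument list (illustrative only):

static void example_cong_avoid(struct sock *sk, u32 in_flight)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Application-limited: growing cwnd would only build a burst. */
	if (!tcp_is_cwnd_limited(sk, in_flight))
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp);			/* exponential growth */
	else
		tcp_cong_avoid_ai(tp, tp->snd_cwnd);	/* additive increase */
}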
@@ -309,7 +306,6 @@ EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
309void tcp_slow_start(struct tcp_sock *tp) 306void tcp_slow_start(struct tcp_sock *tp)
310{ 307{
311 int cnt; /* increase in packets */ 308 int cnt; /* increase in packets */
312 unsigned int delta = 0;
313 309
314 /* RFC3465: ABC Slow start 310 /* RFC3465: ABC Slow start
315 * Increase only after a full MSS of bytes is acked 311 * Increase only after a full MSS of bytes is acked
@@ -336,9 +332,9 @@ void tcp_slow_start(struct tcp_sock *tp)
336 tp->snd_cwnd_cnt += cnt; 332 tp->snd_cwnd_cnt += cnt;
337 while (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 333 while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
338 tp->snd_cwnd_cnt -= tp->snd_cwnd; 334 tp->snd_cwnd_cnt -= tp->snd_cwnd;
339 delta++; 335 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
336 tp->snd_cwnd++;
340 } 337 }
341 tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp);
342} 338}
343EXPORT_SYMBOL_GPL(tcp_slow_start); 339EXPORT_SYMBOL_GPL(tcp_slow_start);
344 340
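A worked example for the two tcp_slow_start() variants above (illustrative numbers): with snd_cwnd = 10 and an ACK that produces cnt = 25, snd_cwnd_cnt reaches 25, the while loop fires twice, and cwnd ends at 12 either way, subject to snd_cwnd_clamp. Only the leftover counter differs: the restored loop subtracts the growing cwnd (25 - 10 - 11, leaving 4), while the newer code subtracts the unchanged cwnd twice (leaving 5) and applies the clamp once at the end.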
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index a9077f441cb..f376b05cca8 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -107,6 +107,7 @@ static inline void bictcp_reset(struct bictcp *ca)
107{ 107{
108 ca->cnt = 0; 108 ca->cnt = 0;
109 ca->last_max_cwnd = 0; 109 ca->last_max_cwnd = 0;
110 ca->loss_cwnd = 0;
110 ca->last_cwnd = 0; 111 ca->last_cwnd = 0;
111 ca->last_time = 0; 112 ca->last_time = 0;
112 ca->bic_origin_point = 0; 113 ca->bic_origin_point = 0;
@@ -141,10 +142,7 @@ static inline void bictcp_hystart_reset(struct sock *sk)
141 142
142static void bictcp_init(struct sock *sk) 143static void bictcp_init(struct sock *sk)
143{ 144{
144 struct bictcp *ca = inet_csk_ca(sk); 145 bictcp_reset(inet_csk_ca(sk));
145
146 bictcp_reset(ca);
147 ca->loss_cwnd = 0;
148 146
149 if (hystart) 147 if (hystart)
150 bictcp_hystart_reset(sk); 148 bictcp_hystart_reset(sk);
@@ -277,7 +275,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
277 * The initial growth of cubic function may be too conservative 275 * The initial growth of cubic function may be too conservative
278 * when the available bandwidth is still unknown. 276 * when the available bandwidth is still unknown.
279 */ 277 */
280 if (ca->last_max_cwnd == 0 && ca->cnt > 20) 278 if (ca->loss_cwnd == 0 && ca->cnt > 20)
281 ca->cnt = 20; /* increase cwnd 5% per RTT */ 279 ca->cnt = 20; /* increase cwnd 5% per RTT */
282 280
283 /* TCP Friendly */ 281 /* TCP Friendly */
@@ -344,7 +342,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk)
344{ 342{
345 struct bictcp *ca = inet_csk_ca(sk); 343 struct bictcp *ca = inet_csk_ca(sk);
346 344
347 return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); 345 return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd);
348} 346}
349 347
350static void bictcp_state(struct sock *sk, u8 new_state) 348static void bictcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index ed3f2ad42e0..939edb3b8e4 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -34,23 +34,11 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
34 tcp_get_info(sk, info); 34 tcp_get_info(sk, info);
35} 35}
36 36
37static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
38 struct inet_diag_req_v2 *r, struct nlattr *bc)
39{
40 inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
41}
42
43static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
44 struct inet_diag_req_v2 *req)
45{
46 return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
47}
48
49static const struct inet_diag_handler tcp_diag_handler = { 37static const struct inet_diag_handler tcp_diag_handler = {
50 .dump = tcp_diag_dump, 38 .idiag_hashinfo = &tcp_hashinfo,
51 .dump_one = tcp_diag_dump_one,
52 .idiag_get_info = tcp_diag_get_info, 39 .idiag_get_info = tcp_diag_get_info,
53 .idiag_type = IPPROTO_TCP, 40 .idiag_type = TCPDIAG_GETSOCK,
41 .idiag_info_size = sizeof(struct tcp_info),
54}; 42};
55 43
56static int __init tcp_diag_init(void) 44static int __init tcp_diag_init(void)
@@ -66,4 +54,4 @@ static void __exit tcp_diag_exit(void)
66module_init(tcp_diag_init); 54module_init(tcp_diag_init);
67module_exit(tcp_diag_exit); 55module_exit(tcp_diag_exit);
68MODULE_LICENSE("GPL"); 56MODULE_LICENSE("GPL");
69MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */); 57MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_INET_DIAG, TCPDIAG_GETSOCK);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
deleted file mode 100644
index 8f7ef0ad80e..00000000000
--- a/net/ipv4/tcp_fastopen.c
+++ /dev/null
@@ -1,92 +0,0 @@
1#include <linux/err.h>
2#include <linux/init.h>
3#include <linux/kernel.h>
4#include <linux/list.h>
5#include <linux/tcp.h>
6#include <linux/rcupdate.h>
7#include <linux/rculist.h>
8#include <net/inetpeer.h>
9#include <net/tcp.h>
10
11int sysctl_tcp_fastopen __read_mostly;
12
13struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
14
15static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
16
17static void tcp_fastopen_ctx_free(struct rcu_head *head)
18{
19 struct tcp_fastopen_context *ctx =
20 container_of(head, struct tcp_fastopen_context, rcu);
21 crypto_free_cipher(ctx->tfm);
22 kfree(ctx);
23}
24
25int tcp_fastopen_reset_cipher(void *key, unsigned int len)
26{
27 int err;
28 struct tcp_fastopen_context *ctx, *octx;
29
30 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
31 if (!ctx)
32 return -ENOMEM;
33 ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
34
35 if (IS_ERR(ctx->tfm)) {
36 err = PTR_ERR(ctx->tfm);
37error: kfree(ctx);
38 pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
39 return err;
40 }
41 err = crypto_cipher_setkey(ctx->tfm, key, len);
42 if (err) {
43 pr_err("TCP: TFO cipher key error: %d\n", err);
44 crypto_free_cipher(ctx->tfm);
45 goto error;
46 }
47 memcpy(ctx->key, key, len);
48
49 spin_lock(&tcp_fastopen_ctx_lock);
50
51 octx = rcu_dereference_protected(tcp_fastopen_ctx,
52 lockdep_is_held(&tcp_fastopen_ctx_lock));
53 rcu_assign_pointer(tcp_fastopen_ctx, ctx);
54 spin_unlock(&tcp_fastopen_ctx_lock);
55
56 if (octx)
57 call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
58 return err;
59}
60
61/* Computes the fastopen cookie for the peer.
62 * The peer address is a 128 bits long (pad with zeros for IPv4).
63 *
64 * The caller must check foc->len to determine if a valid cookie
65 * has been generated successfully.
66*/
67void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc)
68{
69 __be32 peer_addr[4] = { addr, 0, 0, 0 };
70 struct tcp_fastopen_context *ctx;
71
72 rcu_read_lock();
73 ctx = rcu_dereference(tcp_fastopen_ctx);
74 if (ctx) {
75 crypto_cipher_encrypt_one(ctx->tfm,
76 foc->val,
77 (__u8 *)peer_addr);
78 foc->len = TCP_FASTOPEN_COOKIE_SIZE;
79 }
80 rcu_read_unlock();
81}
82
83static int __init tcp_fastopen_init(void)
84{
85 __u8 key[TCP_FASTOPEN_KEY_LENGTH];
86
87 get_random_bytes(key, sizeof(key));
88 tcp_fastopen_reset_cipher(key, sizeof(key));
89 return 0;
90}
91
92late_initcall(tcp_fastopen_init);
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 57bdd17dff4..fe3ecf484b4 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -15,7 +15,7 @@
15 15
16/* Tcp Hybla structure. */ 16/* Tcp Hybla structure. */
17struct hybla { 17struct hybla {
18 bool hybla_en; 18 u8 hybla_en;
19 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ 19 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
20 u32 rho; /* Rho parameter, integer part */ 20 u32 rho; /* Rho parameter, integer part */
21 u32 rho2; /* Rho * Rho, integer part */ 21 u32 rho2; /* Rho * Rho, integer part */
@@ -24,7 +24,8 @@ struct hybla {
24 u32 minrtt; /* Minimum smoothed round trip time value seen */ 24 u32 minrtt; /* Minimum smoothed round trip time value seen */
25}; 25};
26 26
27/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ 27/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
28 expressed in jiffies */
28static int rtt0 = 25; 29static int rtt0 = 25;
29module_param(rtt0, int, 0644); 30module_param(rtt0, int, 0644);
30MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); 31MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
@@ -38,7 +39,7 @@ static inline void hybla_recalc_param (struct sock *sk)
38 ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); 39 ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8);
39 ca->rho = ca->rho_3ls >> 3; 40 ca->rho = ca->rho_3ls >> 3;
40 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; 41 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
41 ca->rho2 = ca->rho2_7ls >> 7; 42 ca->rho2 = ca->rho2_7ls >>7;
42} 43}
43 44
44static void hybla_init(struct sock *sk) 45static void hybla_init(struct sock *sk)
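For scale (illustrative arithmetic, assuming HZ = 1000 and the default rtt0 = 25 ms): a smoothed RTT of 200 ms means tp->srtt = 1600, since srtt is kept in jiffies << 3, so rho_3ls = 1600 / 25 = 64, rho = 8, and rho2 = ((64 * 64) << 1) >> 7 = 64 = rho^2. The max_t(..., 8) floor keeps rho >= 1 on paths faster than the reference RTT.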
@@ -51,7 +52,7 @@ static void hybla_init(struct sock *sk)
51 ca->rho_3ls = 0; 52 ca->rho_3ls = 0;
52 ca->rho2_7ls = 0; 53 ca->rho2_7ls = 0;
53 ca->snd_cwnd_cents = 0; 54 ca->snd_cwnd_cents = 0;
54 ca->hybla_en = true; 55 ca->hybla_en = 1;
55 tp->snd_cwnd = 2; 56 tp->snd_cwnd = 2;
56 tp->snd_cwnd_clamp = 65535; 57 tp->snd_cwnd_clamp = 65535;
57 58
@@ -66,7 +67,6 @@ static void hybla_init(struct sock *sk)
66static void hybla_state(struct sock *sk, u8 ca_state) 67static void hybla_state(struct sock *sk, u8 ca_state)
67{ 68{
68 struct hybla *ca = inet_csk_ca(sk); 69 struct hybla *ca = inet_csk_ca(sk);
69
70 ca->hybla_en = (ca_state == TCP_CA_Open); 70 ca->hybla_en = (ca_state == TCP_CA_Open);
71} 71}
72 72
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 834857f3c87..813b43a76fe 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -313,13 +313,11 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
313 .tcpv_rttcnt = ca->cnt_rtt, 313 .tcpv_rttcnt = ca->cnt_rtt,
314 .tcpv_minrtt = ca->base_rtt, 314 .tcpv_minrtt = ca->base_rtt,
315 }; 315 };
316 u64 t = ca->sum_rtt;
316 317
317 if (info.tcpv_rttcnt > 0) { 318 do_div(t, ca->cnt_rtt);
318 u64 t = ca->sum_rtt; 319 info.tcpv_rtt = t;
319 320
320 do_div(t, info.tcpv_rttcnt);
321 info.tcpv_rtt = t;
322 }
323 nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info); 321 nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
324 } 322 }
325} 323}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 18f97ca76b0..d73aab3fbfc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,8 +61,6 @@
61 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs 61 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
62 */ 62 */
63 63
64#define pr_fmt(fmt) "TCP: " fmt
65
66#include <linux/mm.h> 64#include <linux/mm.h>
67#include <linux/slab.h> 65#include <linux/slab.h>
68#include <linux/module.h> 66#include <linux/module.h>
@@ -85,23 +83,20 @@ int sysctl_tcp_ecn __read_mostly = 2;
85EXPORT_SYMBOL(sysctl_tcp_ecn); 83EXPORT_SYMBOL(sysctl_tcp_ecn);
86int sysctl_tcp_dsack __read_mostly = 1; 84int sysctl_tcp_dsack __read_mostly = 1;
87int sysctl_tcp_app_win __read_mostly = 31; 85int sysctl_tcp_app_win __read_mostly = 31;
88int sysctl_tcp_adv_win_scale __read_mostly = 1; 86int sysctl_tcp_adv_win_scale __read_mostly = 2;
89EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); 87EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
90 88
91/* rfc5961 challenge ack rate limiting */
92int sysctl_tcp_challenge_ack_limit = 100;
93
94int sysctl_tcp_stdurg __read_mostly; 89int sysctl_tcp_stdurg __read_mostly;
95int sysctl_tcp_rfc1337 __read_mostly; 90int sysctl_tcp_rfc1337 __read_mostly;
96int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 91int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
97int sysctl_tcp_frto __read_mostly = 2; 92int sysctl_tcp_frto __read_mostly = 2;
98int sysctl_tcp_frto_response __read_mostly; 93int sysctl_tcp_frto_response __read_mostly;
94int sysctl_tcp_nometrics_save __read_mostly;
99 95
100int sysctl_tcp_thin_dupack __read_mostly; 96int sysctl_tcp_thin_dupack __read_mostly;
101 97
102int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; 98int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
103int sysctl_tcp_abc __read_mostly; 99int sysctl_tcp_abc __read_mostly;
104int sysctl_tcp_early_retrans __read_mostly = 2;
105 100
106#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 101#define FLAG_DATA 0x01 /* Incoming frame contained data. */
107#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 102#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -110,6 +105,7 @@ int sysctl_tcp_early_retrans __read_mostly = 2;
110#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ 105#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
111#define FLAG_DATA_SACKED 0x20 /* New SACK. */ 106#define FLAG_DATA_SACKED 0x20 /* New SACK. */
112#define FLAG_ECE 0x40 /* ECE in this ACK */ 107#define FLAG_ECE 0x40 /* ECE in this ACK */
108#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
113#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ 109#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
114#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ 110#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
115#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ 111#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -178,7 +174,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
178static void tcp_incr_quickack(struct sock *sk) 174static void tcp_incr_quickack(struct sock *sk)
179{ 175{
180 struct inet_connection_sock *icsk = inet_csk(sk); 176 struct inet_connection_sock *icsk = inet_csk(sk);
181 unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); 177 unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
182 178
183 if (quickacks == 0) 179 if (quickacks == 0)
184 quickacks = 2; 180 quickacks = 2;
@@ -198,10 +194,9 @@ static void tcp_enter_quickack_mode(struct sock *sk)
198 * and the session is not interactive. 194 * and the session is not interactive.
199 */ 195 */
200 196
201static inline bool tcp_in_quickack_mode(const struct sock *sk) 197static inline int tcp_in_quickack_mode(const struct sock *sk)
202{ 198{
203 const struct inet_connection_sock *icsk = inet_csk(sk); 199 const struct inet_connection_sock *icsk = inet_csk(sk);
204
205 return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; 200 return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
206} 201}
207 202
@@ -211,7 +206,7 @@ static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
211 tp->ecn_flags |= TCP_ECN_QUEUE_CWR; 206 tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
212} 207}
213 208
214static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) 209static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb)
215{ 210{
216 if (tcp_hdr(skb)->cwr) 211 if (tcp_hdr(skb)->cwr)
217 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 212 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
@@ -222,49 +217,36 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
222 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 217 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
223} 218}
224 219
225static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) 220static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
226{ 221{
227 if (!(tp->ecn_flags & TCP_ECN_OK)) 222 if (tp->ecn_flags & TCP_ECN_OK) {
228 return; 223 if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
229 224 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
230 switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
231 case INET_ECN_NOT_ECT:
232 /* Funny extension: if ECT is not set on a segment, 225 /* Funny extension: if ECT is not set on a segment,
233 * and we already seen ECT on a previous segment, 226 * it is surely retransmit. It is not in ECN RFC,
234 * it is probably a retransmit. 227 * but Linux follows this rule. */
235 */ 228 else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
236 if (tp->ecn_flags & TCP_ECN_SEEN)
237 tcp_enter_quickack_mode((struct sock *)tp);
238 break;
239 case INET_ECN_CE:
240 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
241 /* Better not delay acks, sender can have a very low cwnd */
242 tcp_enter_quickack_mode((struct sock *)tp); 229 tcp_enter_quickack_mode((struct sock *)tp);
243 tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
244 }
245 /* fallinto */
246 default:
247 tp->ecn_flags |= TCP_ECN_SEEN;
248 } 230 }
249} 231}
250 232
251static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) 233static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
252{ 234{
253 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) 235 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
254 tp->ecn_flags &= ~TCP_ECN_OK; 236 tp->ecn_flags &= ~TCP_ECN_OK;
255} 237}
256 238
257static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) 239static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
258{ 240{
259 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) 241 if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
260 tp->ecn_flags &= ~TCP_ECN_OK; 242 tp->ecn_flags &= ~TCP_ECN_OK;
261} 243}
262 244
263static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) 245static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
264{ 246{
265 if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) 247 if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
266 return true; 248 return 1;
267 return false; 249 return 0;
268} 250}
269 251
270/* Buffer size and advertised window tuning. 252/* Buffer size and advertised window tuning.
@@ -274,11 +256,14 @@ static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
274 256
275static void tcp_fixup_sndbuf(struct sock *sk) 257static void tcp_fixup_sndbuf(struct sock *sk)
276{ 258{
277 int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER); 259 int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
260 sizeof(struct sk_buff);
278 261
279 sndmem *= TCP_INIT_CWND; 262 if (sk->sk_sndbuf < 3 * sndmem) {
280 if (sk->sk_sndbuf < sndmem) 263 sk->sk_sndbuf = 3 * sndmem;
281 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); 264 if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
265 sk->sk_sndbuf = sysctl_tcp_wmem[2];
266 }
282} 267}
283 268
284/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) 269/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -324,14 +309,14 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
324 return 0; 309 return 0;
325} 310}
326 311
327static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) 312static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
328{ 313{
329 struct tcp_sock *tp = tcp_sk(sk); 314 struct tcp_sock *tp = tcp_sk(sk);
330 315
331 /* Check #1 */ 316 /* Check #1 */
332 if (tp->rcv_ssthresh < tp->window_clamp && 317 if (tp->rcv_ssthresh < tp->window_clamp &&
333 (int)tp->rcv_ssthresh < tcp_space(sk) && 318 (int)tp->rcv_ssthresh < tcp_space(sk) &&
334 !sk_under_memory_pressure(sk)) { 319 !tcp_memory_pressure) {
335 int incr; 320 int incr;
336 321
337 /* Check #2. Increase window, if skb with such overhead 322 /* Check #2. Increase window, if skb with such overhead
@@ -343,7 +328,6 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
343 incr = __tcp_grow_window(sk, skb); 328 incr = __tcp_grow_window(sk, skb);
344 329
345 if (incr) { 330 if (incr) {
346 incr = max_t(int, incr, 2 * skb->len);
347 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, 331 tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
348 tp->window_clamp); 332 tp->window_clamp);
349 inet_csk(sk)->icsk_ack.quick |= 1; 333 inet_csk(sk)->icsk_ack.quick |= 1;
@@ -355,30 +339,23 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
355 339
356static void tcp_fixup_rcvbuf(struct sock *sk) 340static void tcp_fixup_rcvbuf(struct sock *sk)
357{ 341{
358 u32 mss = tcp_sk(sk)->advmss; 342 struct tcp_sock *tp = tcp_sk(sk);
359 u32 icwnd = TCP_DEFAULT_INIT_RCVWND; 343 int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
360 int rcvmem;
361 344
362 /* Limit to 10 segments if mss <= 1460, 345 /* Try to select rcvbuf so that 4 mss-sized segments
363 * or 14600/mss segments, with a minimum of two segments. 346 * will fit to window and corresponding skbs will fit to our rcvbuf.
347 * (was 3; 4 is minimum to allow fast retransmit to work.)
364 */ 348 */
365 if (mss > 1460) 349 while (tcp_win_from_space(rcvmem) < tp->advmss)
366 icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
367
368 rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER);
369 while (tcp_win_from_space(rcvmem) < mss)
370 rcvmem += 128; 350 rcvmem += 128;
371 351 if (sk->sk_rcvbuf < 4 * rcvmem)
372 rcvmem *= icwnd; 352 sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
373
374 if (sk->sk_rcvbuf < rcvmem)
375 sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
376} 353}
377 354
378/* 4. Try to fixup all. It is made immediately after connection enters 355/* 4. Try to fixup all. It is made immediately after connection enters
379 * established state. 356 * established state.
380 */ 357 */
381void tcp_init_buffer_space(struct sock *sk) 358static void tcp_init_buffer_space(struct sock *sk)
382{ 359{
383 struct tcp_sock *tp = tcp_sk(sk); 360 struct tcp_sock *tp = tcp_sk(sk);
384 int maxwin; 361 int maxwin;
@@ -421,8 +398,8 @@ static void tcp_clamp_window(struct sock *sk)
421 398
422 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && 399 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
423 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && 400 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
424 !sk_under_memory_pressure(sk) && 401 !tcp_memory_pressure &&
425 sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { 402 atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
426 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), 403 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
427 sysctl_tcp_rmem[2]); 404 sysctl_tcp_rmem[2]);
428 } 405 }
@@ -439,7 +416,7 @@ static void tcp_clamp_window(struct sock *sk)
439 */ 416 */
440void tcp_initialize_rcv_mss(struct sock *sk) 417void tcp_initialize_rcv_mss(struct sock *sk)
441{ 418{
442 const struct tcp_sock *tp = tcp_sk(sk); 419 struct tcp_sock *tp = tcp_sk(sk);
443 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); 420 unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
444 421
445 hint = min(hint, tp->rcv_wnd / 2); 422 hint = min(hint, tp->rcv_wnd / 2);
@@ -483,11 +460,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
483 if (!win_dep) { 460 if (!win_dep) {
484 m -= (new_sample >> 3); 461 m -= (new_sample >> 3);
485 new_sample += m; 462 new_sample += m;
486 } else { 463 } else if (m < new_sample)
487 m <<= 3; 464 new_sample = m << 3;
488 if (m < new_sample)
489 new_sample = m;
490 }
491 } else { 465 } else {
492 /* No previous measure. */ 466 /* No previous measure. */
493 new_sample = m << 3; 467 new_sample = m << 3;
@@ -503,7 +477,7 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
503 goto new_measure; 477 goto new_measure;
504 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) 478 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
505 return; 479 return;
506 tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); 480 tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);
507 481
508new_measure: 482new_measure:
509 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; 483 tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
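The receiver-side estimator above stores rcv_rtt_est.rtt in 1/8-jiffy fixed point (the stored value is 8 * RTT). The !win_dep branch is the usual 1/8-gain EWMA, est <- est + (m - est) / 8: an estimate of 100 jiffies with a new sample m = 140 becomes 105. In the win_dep branch the two sides differ in where the << 3 scaling happens: the newer code scales m before comparing, while the restored line compares the unscaled sample against the scaled estimate and scales it only on assignment, so the two are not equivalent.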
@@ -557,7 +531,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
557 space /= tp->advmss; 531 space /= tp->advmss;
558 if (!space) 532 if (!space)
559 space = 1; 533 space = 1;
560 rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); 534 rcvmem = (tp->advmss + MAX_TCP_HEADER +
535 16 + sizeof(struct sk_buff));
561 while (tcp_win_from_space(rcvmem) < tp->advmss) 536 while (tcp_win_from_space(rcvmem) < tp->advmss)
562 rcvmem += 128; 537 rcvmem += 128;
563 space *= rcvmem; 538 space *= rcvmem;
@@ -707,7 +682,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
707/* Calculate rto without backoff. This is the second half of Van Jacobson's 682/* Calculate rto without backoff. This is the second half of Van Jacobson's
708 * routine referred to above. 683 * routine referred to above.
709 */ 684 */
710void tcp_set_rto(struct sock *sk) 685static inline void tcp_set_rto(struct sock *sk)
711{ 686{
712 const struct tcp_sock *tp = tcp_sk(sk); 687 const struct tcp_sock *tp = tcp_sk(sk);
713 /* Old crap is replaced with new one. 8) 688 /* Old crap is replaced with new one. 8)
@@ -734,7 +709,110 @@ void tcp_set_rto(struct sock *sk)
734 tcp_bound_rto(sk); 709 tcp_bound_rto(sk);
735} 710}
736 711
737__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) 712/* Save metrics learned by this TCP session.
713 This function is called only, when TCP finishes successfully
714 i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
715 */
716void tcp_update_metrics(struct sock *sk)
717{
718 struct tcp_sock *tp = tcp_sk(sk);
719 struct dst_entry *dst = __sk_dst_get(sk);
720
721 if (sysctl_tcp_nometrics_save)
722 return;
723
724 dst_confirm(dst);
725
726 if (dst && (dst->flags & DST_HOST)) {
727 const struct inet_connection_sock *icsk = inet_csk(sk);
728 int m;
729 unsigned long rtt;
730
731 if (icsk->icsk_backoff || !tp->srtt) {
732 /* This session failed to estimate rtt. Why?
733 * Probably, no packets returned in time.
734 * Reset our results.
735 */
736 if (!(dst_metric_locked(dst, RTAX_RTT)))
737 dst_metric_set(dst, RTAX_RTT, 0);
738 return;
739 }
740
741 rtt = dst_metric_rtt(dst, RTAX_RTT);
742 m = rtt - tp->srtt;
743
744 /* If newly calculated rtt larger than stored one,
745 * store new one. Otherwise, use EWMA. Remember,
746 * rtt overestimation is always better than underestimation.
747 */
748 if (!(dst_metric_locked(dst, RTAX_RTT))) {
749 if (m <= 0)
750 set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
751 else
752 set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
753 }
754
755 if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
756 unsigned long var;
757 if (m < 0)
758 m = -m;
759
760 /* Scale deviation to rttvar fixed point */
761 m >>= 1;
762 if (m < tp->mdev)
763 m = tp->mdev;
764
765 var = dst_metric_rtt(dst, RTAX_RTTVAR);
766 if (m >= var)
767 var = m;
768 else
769 var -= (var - m) >> 2;
770
771 set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
772 }
773
774 if (tcp_in_initial_slowstart(tp)) {
775 /* Slow start still did not finish. */
776 if (dst_metric(dst, RTAX_SSTHRESH) &&
777 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
778 (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
779 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
780 if (!dst_metric_locked(dst, RTAX_CWND) &&
781 tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
782 dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
783 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
784 icsk->icsk_ca_state == TCP_CA_Open) {
785 /* Cong. avoidance phase, cwnd is reliable. */
786 if (!dst_metric_locked(dst, RTAX_SSTHRESH))
787 dst_metric_set(dst, RTAX_SSTHRESH,
788 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
789 if (!dst_metric_locked(dst, RTAX_CWND))
790 dst_metric_set(dst, RTAX_CWND,
791 (dst_metric(dst, RTAX_CWND) +
792 tp->snd_cwnd) >> 1);
793 } else {
794 /* Else slow start did not finish, cwnd is non-sense,
795 ssthresh may be also invalid.
796 */
797 if (!dst_metric_locked(dst, RTAX_CWND))
798 dst_metric_set(dst, RTAX_CWND,
799 (dst_metric(dst, RTAX_CWND) +
800 tp->snd_ssthresh) >> 1);
801 if (dst_metric(dst, RTAX_SSTHRESH) &&
802 !dst_metric_locked(dst, RTAX_SSTHRESH) &&
803 tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
804 dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
805 }
806
807 if (!dst_metric_locked(dst, RTAX_REORDERING)) {
808 if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
809 tp->reordering != sysctl_tcp_reordering)
810 dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
811 }
812 }
813}
814
815__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
738{ 816{
739 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 817 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
740 818
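In tcp_update_metrics() above, the cached RTAX_RTT is pulled toward the session's srtt with a 1/8 gain only when it is an overestimate (m > 0: rtt <- rtt - (rtt - srtt) / 8) and is replaced outright when it was an underestimate (m <= 0), matching the comment that overestimation is preferred. RTAX_RTTVAR works the same way with a 1/4 gain: the deviation (|m| halved, floored at tp->mdev) replaces var immediately when larger, otherwise var decays toward it by (var - m) / 4. For example, a cached RTT of 200 with srtt = 160 becomes 195; with srtt = 240 it jumps straight to 240.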
@@ -743,22 +821,124 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
743 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 821 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
744} 822}
745 823
824/* Set slow start threshold and cwnd not falling to slow start */
825void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
826{
827 struct tcp_sock *tp = tcp_sk(sk);
828 const struct inet_connection_sock *icsk = inet_csk(sk);
829
830 tp->prior_ssthresh = 0;
831 tp->bytes_acked = 0;
832 if (icsk->icsk_ca_state < TCP_CA_CWR) {
833 tp->undo_marker = 0;
834 if (set_ssthresh)
835 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
836 tp->snd_cwnd = min(tp->snd_cwnd,
837 tcp_packets_in_flight(tp) + 1U);
838 tp->snd_cwnd_cnt = 0;
839 tp->high_seq = tp->snd_nxt;
840 tp->snd_cwnd_stamp = tcp_time_stamp;
841 TCP_ECN_queue_cwr(tp);
842
843 tcp_set_ca_state(sk, TCP_CA_CWR);
844 }
845}
846
746/* 847/*
747 * Packet counting of FACK is based on in-order assumptions, therefore TCP 848 * Packet counting of FACK is based on in-order assumptions, therefore TCP
748 * disables it when reordering is detected 849 * disables it when reordering is detected
749 */ 850 */
750void tcp_disable_fack(struct tcp_sock *tp) 851static void tcp_disable_fack(struct tcp_sock *tp)
751{ 852{
752 /* RFC3517 uses different metric in lost marker => reset on change */ 853 /* RFC3517 uses different metric in lost marker => reset on change */
753 if (tcp_is_fack(tp)) 854 if (tcp_is_fack(tp))
754 tp->lost_skb_hint = NULL; 855 tp->lost_skb_hint = NULL;
755 tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED; 856 tp->rx_opt.sack_ok &= ~2;
756} 857}
757 858
758/* Take a notice that peer is sending D-SACKs */ 859/* Take a notice that peer is sending D-SACKs */
759static void tcp_dsack_seen(struct tcp_sock *tp) 860static void tcp_dsack_seen(struct tcp_sock *tp)
760{ 861{
761 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; 862 tp->rx_opt.sack_ok |= 4;
863}
864
865/* Initialize metrics on socket. */
866
867static void tcp_init_metrics(struct sock *sk)
868{
869 struct tcp_sock *tp = tcp_sk(sk);
870 struct dst_entry *dst = __sk_dst_get(sk);
871
872 if (dst == NULL)
873 goto reset;
874
875 dst_confirm(dst);
876
877 if (dst_metric_locked(dst, RTAX_CWND))
878 tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
879 if (dst_metric(dst, RTAX_SSTHRESH)) {
880 tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
881 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
882 tp->snd_ssthresh = tp->snd_cwnd_clamp;
883 } else {
884 /* ssthresh may have been reduced unnecessarily during.
885 * 3WHS. Restore it back to its initial default.
886 */
887 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
888 }
889 if (dst_metric(dst, RTAX_REORDERING) &&
890 tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
891 tcp_disable_fack(tp);
892 tp->reordering = dst_metric(dst, RTAX_REORDERING);
893 }
894
895 if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
896 goto reset;
897
898 /* Initial rtt is determined from SYN,SYN-ACK.
899 * The segment is small and rtt may appear much
900 * less than real one. Use per-dst memory
901 * to make it more realistic.
902 *
903 * A bit of theory. RTT is time passed after "normal" sized packet
904 * is sent until it is ACKed. In normal circumstances sending small
905 * packets force peer to delay ACKs and calculation is correct too.
906 * The algorithm is adaptive and, provided we follow specs, it
907 * NEVER underestimate RTT. BUT! If peer tries to make some clever
908 * tricks sort of "quick acks" for time long enough to decrease RTT
909 * to low value, and then abruptly stops to do it and starts to delay
910 * ACKs, wait for troubles.
911 */
912 if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
913 tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
914 tp->rtt_seq = tp->snd_nxt;
915 }
916 if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
917 tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
918 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
919 }
920 tcp_set_rto(sk);
921reset:
922 if (tp->srtt == 0) {
923 /* RFC2988bis: We've failed to get a valid RTT sample from
924 * 3WHS. This is most likely due to retransmission,
925 * including spurious one. Reset the RTO back to 3secs
926 * from the more aggressive 1sec to avoid more spurious
927 * retransmission.
928 */
929 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
930 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
931 }
932 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
933 * retransmitted. In light of RFC2988bis' more aggressive 1sec
934 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
935 * retransmission has occurred.
936 */
937 if (tp->total_retrans > 1)
938 tp->snd_cwnd = 1;
939 else
940 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
941 tp->snd_cwnd_stamp = tcp_time_stamp;
762} 942}
763 943
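
As a side note on the seeding above: once srtt and the variance have been taken from the cached route metrics, the retransmit timer follows the usual RFC 6298 shape. Below is a minimal standalone sketch of that arithmetic in plain milliseconds; seed_rto_ms and the two clamp constants are illustrative names, not kernel symbols, and the kernel itself keeps these values in jiffies and fixed point.

/* Illustrative sketch only: seed srtt/rttvar from cached per-destination
 * metrics and derive an RFC 6298 style retransmit timeout. */
#include <stdint.h>

#define SKETCH_RTO_MIN_MS   200u
#define SKETCH_RTO_MAX_MS   120000u

static uint32_t seed_rto_ms(uint32_t cached_rtt, uint32_t cached_rttvar,
                            uint32_t *srtt, uint32_t *rttvar)
{
        uint32_t rto;

        /* Prefer the cached values when they exceed what the 3WHS measured,
         * mirroring the comparisons in tcp_init_metrics() above. */
        if (cached_rtt > *srtt)
                *srtt = cached_rtt;
        if (cached_rttvar > *rttvar)
                *rttvar = cached_rttvar;

        rto = *srtt + 4 * *rttvar;              /* srtt + 4 * rttvar */
        if (rto < SKETCH_RTO_MIN_MS)
                rto = SKETCH_RTO_MIN_MS;
        if (rto > SKETCH_RTO_MAX_MS)
                rto = SKETCH_RTO_MAX_MS;
        return rto;
}

For example, with srtt seeded to 80 ms and rttvar to 20 ms the sketch yields 80 + 4*20 = 160 ms, which the lower clamp then raises to 200 ms.
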
764static void tcp_update_reordering(struct sock *sk, const int metric, 944static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -782,18 +962,15 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
782 962
783 NET_INC_STATS_BH(sock_net(sk), mib_idx); 963 NET_INC_STATS_BH(sock_net(sk), mib_idx);
784#if FASTRETRANS_DEBUG > 1 964#if FASTRETRANS_DEBUG > 1
785 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", 965 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
786 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, 966 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
787 tp->reordering, 967 tp->reordering,
788 tp->fackets_out, 968 tp->fackets_out,
789 tp->sacked_out, 969 tp->sacked_out,
790 tp->undo_marker ? tp->undo_retrans : 0); 970 tp->undo_marker ? tp->undo_retrans : 0);
791#endif 971#endif
792 tcp_disable_fack(tp); 972 tcp_disable_fack(tp);
793 } 973 }
794
795 if (metric > 0)
796 tcp_disable_early_retrans(tp);
797} 974}
798 975
799/* This must be called before lost_out is incremented */ 976/* This must be called before lost_out is incremented */
@@ -851,11 +1028,13 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
851 * These 6 states form finite state machine, controlled by the following events: 1028 * These 6 states form finite state machine, controlled by the following events:
852 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) 1029 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
853 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) 1030 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
854 * 3. Loss detection event of two flavors: 1031 * 3. Loss detection event of one of three flavors:
855 * A. Scoreboard estimator decided the packet is lost. 1032 * A. Scoreboard estimator decided the packet is lost.
856 * A'. Reno "three dupacks" marks head of queue lost. 1033 * A'. Reno "three dupacks" marks head of queue lost.
 857 * A''. Its FACK modification, head until snd.fack is lost. 1034 * A''. Its FACK modification, head until snd.fack is lost.
858 * B. SACK arrives sacking SND.NXT at the moment, when the 1035 * B. SACK arrives sacking data transmitted after never retransmitted
1036 * hole was sent out.
1037 * C. SACK arrives sacking SND.NXT at the moment, when the
859 * segment was retransmitted. 1038 * segment was retransmitted.
860 * 4. D-SACK added new rule: D-SACK changes any tag to S. 1039 * 4. D-SACK added new rule: D-SACK changes any tag to S.
861 * 1040 *
@@ -924,36 +1103,36 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
924 * the exact amount is rather hard to quantify. However, tp->max_window can 1103 * the exact amount is rather hard to quantify. However, tp->max_window can
925 * be used as an exaggerated estimate. 1104 * be used as an exaggerated estimate.
926 */ 1105 */
927static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, 1106static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
928 u32 start_seq, u32 end_seq) 1107 u32 start_seq, u32 end_seq)
929{ 1108{
930 /* Too far in future, or reversed (interpretation is ambiguous) */ 1109 /* Too far in future, or reversed (interpretation is ambiguous) */
931 if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) 1110 if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
932 return false; 1111 return 0;
933 1112
934 /* Nasty start_seq wrap-around check (see comments above) */ 1113 /* Nasty start_seq wrap-around check (see comments above) */
935 if (!before(start_seq, tp->snd_nxt)) 1114 if (!before(start_seq, tp->snd_nxt))
936 return false; 1115 return 0;
937 1116
938 /* In outstanding window? ...This is valid exit for D-SACKs too. 1117 /* In outstanding window? ...This is valid exit for D-SACKs too.
939 * start_seq == snd_una is non-sensical (see comments above) 1118 * start_seq == snd_una is non-sensical (see comments above)
940 */ 1119 */
941 if (after(start_seq, tp->snd_una)) 1120 if (after(start_seq, tp->snd_una))
942 return true; 1121 return 1;
943 1122
944 if (!is_dsack || !tp->undo_marker) 1123 if (!is_dsack || !tp->undo_marker)
945 return false; 1124 return 0;
946 1125
947 /* ...Then it's D-SACK, and must reside below snd_una completely */ 1126 /* ...Then it's D-SACK, and must reside below snd_una completely */
948 if (after(end_seq, tp->snd_una)) 1127 if (after(end_seq, tp->snd_una))
949 return false; 1128 return 0;
950 1129
951 if (!before(start_seq, tp->undo_marker)) 1130 if (!before(start_seq, tp->undo_marker))
952 return true; 1131 return 1;
953 1132
954 /* Too old */ 1133 /* Too old */
955 if (!after(end_seq, tp->undo_marker)) 1134 if (!after(end_seq, tp->undo_marker))
956 return false; 1135 return 0;
957 1136
958 /* Undo_marker boundary crossing (overestimates a lot). Known already: 1137 /* Undo_marker boundary crossing (overestimates a lot). Known already:
959 * start_seq < undo_marker and end_seq >= undo_marker. 1138 * start_seq < undo_marker and end_seq >= undo_marker.
@@ -962,7 +1141,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
962} 1141}
963 1142
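
All of the checks in tcp_is_sackblock_valid() rely on the wrap-safe sequence comparisons before() and after() from include/net/tcp.h. For reference, they are equivalent to the following standalone helpers, renamed seq_before/seq_after here only to avoid clashing with the kernel macros:

/* Wrap-safe 32-bit sequence comparisons: the signed subtraction keeps the
 * test correct across the 2^32 wrap, which is why start_seq/end_seq can be
 * compared against snd_una and snd_nxt directly above. */
#include <stdint.h>
#include <stdbool.h>

static inline bool seq_before(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq1 - seq2) < 0;
}

static inline bool seq_after(uint32_t seq2, uint32_t seq1)
{
        return seq_before(seq1, seq2);
}
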
964/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". 1143/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
965 * Event "B". Later note: FACK people cheated me again 8), we have to account 1144 * Event "C". Later note: FACK people cheated me again 8), we have to account
966 * for reordering! Ugly, but should help. 1145 * for reordering! Ugly, but should help.
967 * 1146 *
968 * Search retransmitted skbs from write_queue that were sent when snd_nxt was 1147 * Search retransmitted skbs from write_queue that were sent when snd_nxt was
@@ -1025,17 +1204,17 @@ static void tcp_mark_lost_retrans(struct sock *sk)
1025 tp->lost_retrans_low = new_low_seq; 1204 tp->lost_retrans_low = new_low_seq;
1026} 1205}
1027 1206
1028static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, 1207static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
1029 struct tcp_sack_block_wire *sp, int num_sacks, 1208 struct tcp_sack_block_wire *sp, int num_sacks,
1030 u32 prior_snd_una) 1209 u32 prior_snd_una)
1031{ 1210{
1032 struct tcp_sock *tp = tcp_sk(sk); 1211 struct tcp_sock *tp = tcp_sk(sk);
1033 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); 1212 u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
1034 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); 1213 u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
1035 bool dup_sack = false; 1214 int dup_sack = 0;
1036 1215
1037 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { 1216 if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1038 dup_sack = true; 1217 dup_sack = 1;
1039 tcp_dsack_seen(tp); 1218 tcp_dsack_seen(tp);
1040 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); 1219 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
1041 } else if (num_sacks > 1) { 1220 } else if (num_sacks > 1) {
@@ -1044,7 +1223,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
1044 1223
1045 if (!after(end_seq_0, end_seq_1) && 1224 if (!after(end_seq_0, end_seq_1) &&
1046 !before(start_seq_0, start_seq_1)) { 1225 !before(start_seq_0, start_seq_1)) {
1047 dup_sack = true; 1226 dup_sack = 1;
1048 tcp_dsack_seen(tp); 1227 tcp_dsack_seen(tp);
1049 NET_INC_STATS_BH(sock_net(sk), 1228 NET_INC_STATS_BH(sock_net(sk),
1050 LINUX_MIB_TCPDSACKOFORECV); 1229 LINUX_MIB_TCPDSACKOFORECV);
@@ -1075,10 +1254,9 @@ struct tcp_sacktag_state {
1075 * FIXME: this could be merged to shift decision code 1254 * FIXME: this could be merged to shift decision code
1076 */ 1255 */
1077static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, 1256static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1078 u32 start_seq, u32 end_seq) 1257 u32 start_seq, u32 end_seq)
1079{ 1258{
1080 int err; 1259 int in_sack, err;
1081 bool in_sack;
1082 unsigned int pkt_len; 1260 unsigned int pkt_len;
1083 unsigned int mss; 1261 unsigned int mss;
1084 1262
@@ -1120,26 +1298,25 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
1120 return in_sack; 1298 return in_sack;
1121} 1299}
1122 1300
1123/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ 1301static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1124static u8 tcp_sacktag_one(struct sock *sk, 1302 struct tcp_sacktag_state *state,
1125 struct tcp_sacktag_state *state, u8 sacked, 1303 int dup_sack, int pcount)
1126 u32 start_seq, u32 end_seq,
1127 bool dup_sack, int pcount)
1128{ 1304{
1129 struct tcp_sock *tp = tcp_sk(sk); 1305 struct tcp_sock *tp = tcp_sk(sk);
1306 u8 sacked = TCP_SKB_CB(skb)->sacked;
1130 int fack_count = state->fack_count; 1307 int fack_count = state->fack_count;
1131 1308
1132 /* Account D-SACK for retransmitted packet. */ 1309 /* Account D-SACK for retransmitted packet. */
1133 if (dup_sack && (sacked & TCPCB_RETRANS)) { 1310 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1134 if (tp->undo_marker && tp->undo_retrans && 1311 if (tp->undo_marker && tp->undo_retrans &&
1135 after(end_seq, tp->undo_marker)) 1312 after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
1136 tp->undo_retrans--; 1313 tp->undo_retrans--;
1137 if (sacked & TCPCB_SACKED_ACKED) 1314 if (sacked & TCPCB_SACKED_ACKED)
1138 state->reord = min(fack_count, state->reord); 1315 state->reord = min(fack_count, state->reord);
1139 } 1316 }
1140 1317
1141 /* Nothing to do; acked frame is about to be dropped (was ACKed). */ 1318 /* Nothing to do; acked frame is about to be dropped (was ACKed). */
1142 if (!after(end_seq, tp->snd_una)) 1319 if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1143 return sacked; 1320 return sacked;
1144 1321
1145 if (!(sacked & TCPCB_SACKED_ACKED)) { 1322 if (!(sacked & TCPCB_SACKED_ACKED)) {
@@ -1158,13 +1335,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
1158 /* New sack for not retransmitted frame, 1335 /* New sack for not retransmitted frame,
1159 * which was in hole. It is reordering. 1336 * which was in hole. It is reordering.
1160 */ 1337 */
1161 if (before(start_seq, 1338 if (before(TCP_SKB_CB(skb)->seq,
1162 tcp_highest_sack_seq(tp))) 1339 tcp_highest_sack_seq(tp)))
1163 state->reord = min(fack_count, 1340 state->reord = min(fack_count,
1164 state->reord); 1341 state->reord);
1165 1342
1166 /* SACK enhanced F-RTO (RFC4138; Appendix B) */ 1343 /* SACK enhanced F-RTO (RFC4138; Appendix B) */
1167 if (!after(end_seq, tp->frto_highmark)) 1344 if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
1168 state->flag |= FLAG_ONLY_ORIG_SACKED; 1345 state->flag |= FLAG_ONLY_ORIG_SACKED;
1169 } 1346 }
1170 1347
@@ -1182,7 +1359,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
1182 1359
1183 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ 1360 /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
1184 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && 1361 if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
1185 before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) 1362 before(TCP_SKB_CB(skb)->seq,
1363 TCP_SKB_CB(tp->lost_skb_hint)->seq))
1186 tp->lost_cnt_hint += pcount; 1364 tp->lost_cnt_hint += pcount;
1187 1365
1188 if (fack_count > tp->fackets_out) 1366 if (fack_count > tp->fackets_out)
@@ -1201,30 +1379,16 @@ static u8 tcp_sacktag_one(struct sock *sk,
1201 return sacked; 1379 return sacked;
1202} 1380}
1203 1381
1204/* Shift newly-SACKed bytes from this skb to the immediately previous 1382static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1205 * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. 1383 struct tcp_sacktag_state *state,
1206 */ 1384 unsigned int pcount, int shifted, int mss,
1207static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, 1385 int dup_sack)
1208 struct tcp_sacktag_state *state,
1209 unsigned int pcount, int shifted, int mss,
1210 bool dup_sack)
1211{ 1386{
1212 struct tcp_sock *tp = tcp_sk(sk); 1387 struct tcp_sock *tp = tcp_sk(sk);
1213 struct sk_buff *prev = tcp_write_queue_prev(sk, skb); 1388 struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
1214 u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */
1215 u32 end_seq = start_seq + shifted; /* end of newly-SACKed */
1216 1389
1217 BUG_ON(!pcount); 1390 BUG_ON(!pcount);
1218 1391
1219 /* Adjust counters and hints for the newly sacked sequence
1220 * range but discard the return value since prev is already
1221 * marked. We must tag the range first because the seq
1222 * advancement below implicitly advances
1223 * tcp_highest_sack_seq() when skb is highest_sack.
1224 */
1225 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1226 start_seq, end_seq, dup_sack, pcount);
1227
1228 if (skb == tp->lost_skb_hint) 1392 if (skb == tp->lost_skb_hint)
1229 tp->lost_cnt_hint += pcount; 1393 tp->lost_cnt_hint += pcount;
1230 1394
@@ -1251,13 +1415,16 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1251 skb_shinfo(skb)->gso_type = 0; 1415 skb_shinfo(skb)->gso_type = 0;
1252 } 1416 }
1253 1417
1418 /* We discard results */
1419 tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
1420
1254 /* Difference in this won't matter, both ACKed by the same cumul. ACK */ 1421 /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1255 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); 1422 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1256 1423
1257 if (skb->len > 0) { 1424 if (skb->len > 0) {
1258 BUG_ON(!tcp_skb_pcount(skb)); 1425 BUG_ON(!tcp_skb_pcount(skb));
1259 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); 1426 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1260 return false; 1427 return 0;
1261 } 1428 }
1262 1429
1263 /* Whole SKB was eaten :-) */ 1430 /* Whole SKB was eaten :-) */
@@ -1271,7 +1438,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1271 tp->lost_cnt_hint -= tcp_skb_pcount(prev); 1438 tp->lost_cnt_hint -= tcp_skb_pcount(prev);
1272 } 1439 }
1273 1440
1274 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags; 1441 TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
1275 if (skb == tcp_highest_sack(sk)) 1442 if (skb == tcp_highest_sack(sk))
1276 tcp_advance_highest_sack(sk, skb); 1443 tcp_advance_highest_sack(sk, skb);
1277 1444
@@ -1280,19 +1447,19 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1280 1447
1281 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); 1448 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
1282 1449
1283 return true; 1450 return 1;
1284} 1451}
1285 1452
1286/* I wish gso_size would have a bit more sane initialization than 1453/* I wish gso_size would have a bit more sane initialization than
1287 * something-or-zero which complicates things 1454 * something-or-zero which complicates things
1288 */ 1455 */
1289static int tcp_skb_seglen(const struct sk_buff *skb) 1456static int tcp_skb_seglen(struct sk_buff *skb)
1290{ 1457{
1291 return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); 1458 return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
1292} 1459}
1293 1460
1294/* Shifting pages past head area doesn't work */ 1461/* Shifting pages past head area doesn't work */
1295static int skb_can_shift(const struct sk_buff *skb) 1462static int skb_can_shift(struct sk_buff *skb)
1296{ 1463{
1297 return !skb_headlen(skb) && skb_is_nonlinear(skb); 1464 return !skb_headlen(skb) && skb_is_nonlinear(skb);
1298} 1465}
@@ -1303,7 +1470,7 @@ static int skb_can_shift(const struct sk_buff *skb)
1303static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, 1470static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1304 struct tcp_sacktag_state *state, 1471 struct tcp_sacktag_state *state,
1305 u32 start_seq, u32 end_seq, 1472 u32 start_seq, u32 end_seq,
1306 bool dup_sack) 1473 int dup_sack)
1307{ 1474{
1308 struct tcp_sock *tp = tcp_sk(sk); 1475 struct tcp_sock *tp = tcp_sk(sk);
1309 struct sk_buff *prev; 1476 struct sk_buff *prev;
@@ -1398,10 +1565,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1398 } 1565 }
1399 } 1566 }
1400 1567
1401 /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */
1402 if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
1403 goto fallback;
1404
1405 if (!skb_shift(prev, skb, len)) 1568 if (!skb_shift(prev, skb, len))
1406 goto fallback; 1569 goto fallback;
1407 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) 1570 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
@@ -1442,14 +1605,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1442 struct tcp_sack_block *next_dup, 1605 struct tcp_sack_block *next_dup,
1443 struct tcp_sacktag_state *state, 1606 struct tcp_sacktag_state *state,
1444 u32 start_seq, u32 end_seq, 1607 u32 start_seq, u32 end_seq,
1445 bool dup_sack_in) 1608 int dup_sack_in)
1446{ 1609{
1447 struct tcp_sock *tp = tcp_sk(sk); 1610 struct tcp_sock *tp = tcp_sk(sk);
1448 struct sk_buff *tmp; 1611 struct sk_buff *tmp;
1449 1612
1450 tcp_for_write_queue_from(skb, sk) { 1613 tcp_for_write_queue_from(skb, sk) {
1451 int in_sack = 0; 1614 int in_sack = 0;
1452 bool dup_sack = dup_sack_in; 1615 int dup_sack = dup_sack_in;
1453 1616
1454 if (skb == tcp_send_head(sk)) 1617 if (skb == tcp_send_head(sk))
1455 break; 1618 break;
@@ -1464,7 +1627,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1464 next_dup->start_seq, 1627 next_dup->start_seq,
1465 next_dup->end_seq); 1628 next_dup->end_seq);
1466 if (in_sack > 0) 1629 if (in_sack > 0)
1467 dup_sack = true; 1630 dup_sack = 1;
1468 } 1631 }
1469 1632
1470 /* skb reference here is a bit tricky to get right, since 1633 /* skb reference here is a bit tricky to get right, since
@@ -1492,14 +1655,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
1492 break; 1655 break;
1493 1656
1494 if (in_sack) { 1657 if (in_sack) {
1495 TCP_SKB_CB(skb)->sacked = 1658 TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk,
1496 tcp_sacktag_one(sk, 1659 state,
1497 state, 1660 dup_sack,
1498 TCP_SKB_CB(skb)->sacked, 1661 tcp_skb_pcount(skb));
1499 TCP_SKB_CB(skb)->seq,
1500 TCP_SKB_CB(skb)->end_seq,
1501 dup_sack,
1502 tcp_skb_pcount(skb));
1503 1662
1504 if (!before(TCP_SKB_CB(skb)->seq, 1663 if (!before(TCP_SKB_CB(skb)->seq,
1505 tcp_highest_sack_seq(tp))) 1664 tcp_highest_sack_seq(tp)))
@@ -1549,19 +1708,19 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
1549 return skb; 1708 return skb;
1550} 1709}
1551 1710
1552static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) 1711static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache)
1553{ 1712{
1554 return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); 1713 return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1555} 1714}
1556 1715
1557static int 1716static int
1558tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, 1717tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1559 u32 prior_snd_una) 1718 u32 prior_snd_una)
1560{ 1719{
1561 const struct inet_connection_sock *icsk = inet_csk(sk); 1720 const struct inet_connection_sock *icsk = inet_csk(sk);
1562 struct tcp_sock *tp = tcp_sk(sk); 1721 struct tcp_sock *tp = tcp_sk(sk);
1563 const unsigned char *ptr = (skb_transport_header(ack_skb) + 1722 unsigned char *ptr = (skb_transport_header(ack_skb) +
1564 TCP_SKB_CB(ack_skb)->sacked); 1723 TCP_SKB_CB(ack_skb)->sacked);
1565 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); 1724 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
1566 struct tcp_sack_block sp[TCP_NUM_SACKS]; 1725 struct tcp_sack_block sp[TCP_NUM_SACKS];
1567 struct tcp_sack_block *cache; 1726 struct tcp_sack_block *cache;
@@ -1569,7 +1728,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1569 struct sk_buff *skb; 1728 struct sk_buff *skb;
1570 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); 1729 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
1571 int used_sacks; 1730 int used_sacks;
1572 bool found_dup_sack = false; 1731 int found_dup_sack = 0;
1573 int i, j; 1732 int i, j;
1574 int first_sack_index; 1733 int first_sack_index;
1575 1734
@@ -1600,7 +1759,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1600 used_sacks = 0; 1759 used_sacks = 0;
1601 first_sack_index = 0; 1760 first_sack_index = 0;
1602 for (i = 0; i < num_sacks; i++) { 1761 for (i = 0; i < num_sacks; i++) {
1603 bool dup_sack = !i && found_dup_sack; 1762 int dup_sack = !i && found_dup_sack;
1604 1763
1605 sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); 1764 sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
1606 sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); 1765 sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
@@ -1667,12 +1826,16 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1667 while (i < used_sacks) { 1826 while (i < used_sacks) {
1668 u32 start_seq = sp[i].start_seq; 1827 u32 start_seq = sp[i].start_seq;
1669 u32 end_seq = sp[i].end_seq; 1828 u32 end_seq = sp[i].end_seq;
1670 bool dup_sack = (found_dup_sack && (i == first_sack_index)); 1829 int dup_sack = (found_dup_sack && (i == first_sack_index));
1671 struct tcp_sack_block *next_dup = NULL; 1830 struct tcp_sack_block *next_dup = NULL;
1672 1831
1673 if (found_dup_sack && ((i + 1) == first_sack_index)) 1832 if (found_dup_sack && ((i + 1) == first_sack_index))
1674 next_dup = &sp[i + 1]; 1833 next_dup = &sp[i + 1];
1675 1834
1835 /* Event "B" in the comment above. */
1836 if (after(end_seq, tp->high_seq))
1837 state.flag |= FLAG_DATA_LOST;
1838
1676 /* Skip too early cached blocks */ 1839 /* Skip too early cached blocks */
1677 while (tcp_sack_cache_ok(tp, cache) && 1840 while (tcp_sack_cache_ok(tp, cache) &&
1678 !before(start_seq, cache->end_seq)) 1841 !before(start_seq, cache->end_seq))
@@ -1769,9 +1932,9 @@ out:
1769} 1932}
1770 1933
1771/* Limits sacked_out so that sum with lost_out isn't ever larger than 1934/* Limits sacked_out so that sum with lost_out isn't ever larger than
1772 * packets_out. Returns false if sacked_out adjustment wasn't necessary. 1935 * packets_out. Returns zero if sacked_out adjustment wasn't necessary.
1773 */ 1936 */
1774static bool tcp_limit_reno_sacked(struct tcp_sock *tp) 1937static int tcp_limit_reno_sacked(struct tcp_sock *tp)
1775{ 1938{
1776 u32 holes; 1939 u32 holes;
1777 1940
@@ -1780,9 +1943,9 @@ static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
1780 1943
1781 if ((tp->sacked_out + holes) > tp->packets_out) { 1944 if ((tp->sacked_out + holes) > tp->packets_out) {
1782 tp->sacked_out = tp->packets_out - holes; 1945 tp->sacked_out = tp->packets_out - holes;
1783 return true; 1946 return 1;
1784 } 1947 }
1785 return false; 1948 return 0;
1786} 1949}
1787 1950
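
Since a Reno (non-SACK) flow only infers SACKed data from duplicate ACKs, the clamp above preserves the bookkeeping invariant sacked_out + lost_out <= packets_out. The following is a toy model of that invariant on plain integers rather than the real tcp_sock; it also omits the additional cap the kernel applies to holes, so it is illustrative only:

/* Toy model of the Reno sacked_out clamp: duplicate ACKs stand in for
 * SACKed segments, so cap them to keep sacked_out + holes <= packets_out. */
struct reno_counts {
        unsigned int packets_out;
        unsigned int sacked_out;
        unsigned int lost_out;
};

static int limit_reno_sacked(struct reno_counts *c)
{
        unsigned int holes = c->lost_out ? c->lost_out : 1;

        if (c->sacked_out + holes > c->packets_out) {
                c->sacked_out = c->packets_out - holes;
                return 1;       /* an adjustment was necessary */
        }
        return 0;
}
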
1788/* If we receive more dupacks than we expected counting segments 1951/* If we receive more dupacks than we expected counting segments
@@ -1836,40 +1999,40 @@ static int tcp_is_sackfrto(const struct tcp_sock *tp)
1836/* F-RTO can only be used if TCP has never retransmitted anything other than 1999/* F-RTO can only be used if TCP has never retransmitted anything other than
1837 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) 2000 * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
1838 */ 2001 */
1839bool tcp_use_frto(struct sock *sk) 2002int tcp_use_frto(struct sock *sk)
1840{ 2003{
1841 const struct tcp_sock *tp = tcp_sk(sk); 2004 const struct tcp_sock *tp = tcp_sk(sk);
1842 const struct inet_connection_sock *icsk = inet_csk(sk); 2005 const struct inet_connection_sock *icsk = inet_csk(sk);
1843 struct sk_buff *skb; 2006 struct sk_buff *skb;
1844 2007
1845 if (!sysctl_tcp_frto) 2008 if (!sysctl_tcp_frto)
1846 return false; 2009 return 0;
1847 2010
1848 /* MTU probe and F-RTO won't really play nicely along currently */ 2011 /* MTU probe and F-RTO won't really play nicely along currently */
1849 if (icsk->icsk_mtup.probe_size) 2012 if (icsk->icsk_mtup.probe_size)
1850 return false; 2013 return 0;
1851 2014
1852 if (tcp_is_sackfrto(tp)) 2015 if (tcp_is_sackfrto(tp))
1853 return true; 2016 return 1;
1854 2017
1855 /* Avoid expensive walking of rexmit queue if possible */ 2018 /* Avoid expensive walking of rexmit queue if possible */
1856 if (tp->retrans_out > 1) 2019 if (tp->retrans_out > 1)
1857 return false; 2020 return 0;
1858 2021
1859 skb = tcp_write_queue_head(sk); 2022 skb = tcp_write_queue_head(sk);
1860 if (tcp_skb_is_last(sk, skb)) 2023 if (tcp_skb_is_last(sk, skb))
1861 return true; 2024 return 1;
1862 skb = tcp_write_queue_next(sk, skb); /* Skips head */ 2025 skb = tcp_write_queue_next(sk, skb); /* Skips head */
1863 tcp_for_write_queue_from(skb, sk) { 2026 tcp_for_write_queue_from(skb, sk) {
1864 if (skb == tcp_send_head(sk)) 2027 if (skb == tcp_send_head(sk))
1865 break; 2028 break;
1866 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) 2029 if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
1867 return false; 2030 return 0;
1868 /* Short-circuit when first non-SACKed skb has been checked */ 2031 /* Short-circuit when first non-SACKed skb has been checked */
1869 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 2032 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1870 break; 2033 break;
1871 } 2034 }
1872 return true; 2035 return 1;
1873} 2036}
1874 2037
1875/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO 2038/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
@@ -2105,7 +2268,7 @@ void tcp_enter_loss(struct sock *sk, int how)
2105 * 2268 *
2106 * Do processing similar to RTO timeout. 2269 * Do processing similar to RTO timeout.
2107 */ 2270 */
2108static bool tcp_check_sack_reneging(struct sock *sk, int flag) 2271static int tcp_check_sack_reneging(struct sock *sk, int flag)
2109{ 2272{
2110 if (flag & FLAG_SACK_RENEGING) { 2273 if (flag & FLAG_SACK_RENEGING) {
2111 struct inet_connection_sock *icsk = inet_csk(sk); 2274 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2116,12 +2279,12 @@ static bool tcp_check_sack_reneging(struct sock *sk, int flag)
2116 tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); 2279 tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
2117 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2280 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2118 icsk->icsk_rto, TCP_RTO_MAX); 2281 icsk->icsk_rto, TCP_RTO_MAX);
2119 return true; 2282 return 1;
2120 } 2283 }
2121 return false; 2284 return 0;
2122} 2285}
2123 2286
2124static inline int tcp_fackets_out(const struct tcp_sock *tp) 2287static inline int tcp_fackets_out(struct tcp_sock *tp)
2125{ 2288{
2126 return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; 2289 return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
2127} 2290}
@@ -2141,41 +2304,19 @@ static inline int tcp_fackets_out(const struct tcp_sock *tp)
2141 * they differ. Since neither occurs due to loss, TCP should really 2304 * they differ. Since neither occurs due to loss, TCP should really
2142 * ignore them. 2305 * ignore them.
2143 */ 2306 */
2144static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) 2307static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
2145{ 2308{
2146 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; 2309 return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
2147} 2310}
2148 2311
2149static bool tcp_pause_early_retransmit(struct sock *sk, int flag) 2312static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
2150{
2151 struct tcp_sock *tp = tcp_sk(sk);
2152 unsigned long delay;
2153
2154 /* Delay early retransmit and entering fast recovery for
2155 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
2156 * available, or RTO is scheduled to fire first.
2157 */
2158 if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt)
2159 return false;
2160
2161 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
2162 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2163 return false;
2164
2165 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX);
2166 tp->early_retrans_delayed = 1;
2167 return true;
2168}
2169
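
One detail worth spelling out in the removed helper above: tp->srtt is kept in 1/8-tick fixed point (eight times the smoothed RTT in jiffies), so srtt >> 5 is RTT/4, exactly the "max(RTT/4, 2msec)" the comment promises. A one-function sketch of that delay computation follows; early_retrans_delay is an illustrative name, not a kernel symbol:

/* srtt is stored as 8 * smoothed_rtt, so shifting right by 5 yields RTT/4.
 * Illustrative only; tcp_pause_early_retransmit() above plugs the result
 * straight into inet_csk_reset_xmit_timer(). */
static unsigned long early_retrans_delay(unsigned long srtt_shifted,
                                         unsigned long two_msec_jiffies)
{
        unsigned long quarter_rtt = srtt_shifted >> 5;

        return quarter_rtt > two_msec_jiffies ? quarter_rtt : two_msec_jiffies;
}
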
2170static inline int tcp_skb_timedout(const struct sock *sk,
2171 const struct sk_buff *skb)
2172{ 2313{
2173 return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; 2314 return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
2174} 2315}
2175 2316
2176static inline int tcp_head_timedout(const struct sock *sk) 2317static inline int tcp_head_timedout(struct sock *sk)
2177{ 2318{
2178 const struct tcp_sock *tp = tcp_sk(sk); 2319 struct tcp_sock *tp = tcp_sk(sk);
2179 2320
2180 return tp->packets_out && 2321 return tp->packets_out &&
2181 tcp_skb_timedout(sk, tcp_write_queue_head(sk)); 2322 tcp_skb_timedout(sk, tcp_write_queue_head(sk));
@@ -2274,28 +2415,28 @@ static inline int tcp_head_timedout(const struct sock *sk)
2274 * Main question: may we further continue forward transmission 2415 * Main question: may we further continue forward transmission
2275 * with the same cwnd? 2416 * with the same cwnd?
2276 */ 2417 */
2277static bool tcp_time_to_recover(struct sock *sk, int flag) 2418static int tcp_time_to_recover(struct sock *sk)
2278{ 2419{
2279 struct tcp_sock *tp = tcp_sk(sk); 2420 struct tcp_sock *tp = tcp_sk(sk);
2280 __u32 packets_out; 2421 __u32 packets_out;
2281 2422
2282 /* Do not perform any recovery during F-RTO algorithm */ 2423 /* Do not perform any recovery during F-RTO algorithm */
2283 if (tp->frto_counter) 2424 if (tp->frto_counter)
2284 return false; 2425 return 0;
2285 2426
2286 /* Trick#1: The loss is proven. */ 2427 /* Trick#1: The loss is proven. */
2287 if (tp->lost_out) 2428 if (tp->lost_out)
2288 return true; 2429 return 1;
2289 2430
2290 /* Not-A-Trick#2 : Classic rule... */ 2431 /* Not-A-Trick#2 : Classic rule... */
2291 if (tcp_dupack_heuristics(tp) > tp->reordering) 2432 if (tcp_dupack_heuristics(tp) > tp->reordering)
2292 return true; 2433 return 1;
2293 2434
2294 /* Trick#3 : when we use RFC2988 timer restart, fast 2435 /* Trick#3 : when we use RFC2988 timer restart, fast
2295 * retransmit can be triggered by timeout of queue head. 2436 * retransmit can be triggered by timeout of queue head.
2296 */ 2437 */
2297 if (tcp_is_fack(tp) && tcp_head_timedout(sk)) 2438 if (tcp_is_fack(tp) && tcp_head_timedout(sk))
2298 return true; 2439 return 1;
2299 2440
2300 /* Trick#4: It is still not OK... But will it be useful to delay 2441 /* Trick#4: It is still not OK... But will it be useful to delay
2301 * recovery more? 2442 * recovery more?
@@ -2307,7 +2448,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2307 /* We have nothing to send. This connection is limited 2448 /* We have nothing to send. This connection is limited
2308 * either by receiver window or by application. 2449 * either by receiver window or by application.
2309 */ 2450 */
2310 return true; 2451 return 1;
2311 } 2452 }
2312 2453
2313 /* If a thin stream is detected, retransmit after first 2454 /* If a thin stream is detected, retransmit after first
@@ -2318,19 +2459,9 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
2318 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && 2459 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
2319 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && 2460 tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
2320 tcp_is_sack(tp) && !tcp_send_head(sk)) 2461 tcp_is_sack(tp) && !tcp_send_head(sk))
2321 return true; 2462 return 1;
2322 2463
2323 /* Trick#6: TCP early retransmit, per RFC5827. To avoid spurious 2464 return 0;
2324 * retransmissions due to small network reorderings, we implement
2325 * Mitigation A.3 in the RFC and delay the retransmission for a short
2326 * interval if appropriate.
2327 */
2328 if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
2329 (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) &&
2330 !tcp_may_send_now(sk))
2331 return !tcp_pause_early_retransmit(sk, flag);
2332
2333 return false;
2334} 2465}
2335 2466
2336/* New heuristics: it is possible only after we switched to restart timer 2467/* New heuristics: it is possible only after we switched to restart timer
@@ -2371,11 +2502,8 @@ static void tcp_timeout_skbs(struct sock *sk)
2371 tcp_verify_left_out(tp); 2502 tcp_verify_left_out(tp);
2372} 2503}
2373 2504
2374/* Detect loss in event "A" above by marking head of queue up as lost. 2505/* Mark head of queue up as lost. With RFC3517 SACK, the packet count
2375 * For FACK or non-SACK(Reno) senders, the first "packets" number of segments 2506 * is against sacked "cnt", otherwise it's against facked "cnt"
2376 * are considered lost. For RFC3517 SACK, a segment is considered lost if it
2377 * has at least tp->reordering SACKed seqments above it; "packets" refers to
2378 * the maximum SACKed segments to pass before reaching this limit.
2379 */ 2507 */
2380static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) 2508static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2381{ 2509{
@@ -2384,8 +2512,6 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2384 int cnt, oldcnt; 2512 int cnt, oldcnt;
2385 int err; 2513 int err;
2386 unsigned int mss; 2514 unsigned int mss;
2387 /* Use SACK to deduce losses of new sequences sent during recovery */
2388 const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
2389 2515
2390 WARN_ON(packets > tp->packets_out); 2516 WARN_ON(packets > tp->packets_out);
2391 if (tp->lost_skb_hint) { 2517 if (tp->lost_skb_hint) {
@@ -2407,7 +2533,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2407 tp->lost_skb_hint = skb; 2533 tp->lost_skb_hint = skb;
2408 tp->lost_cnt_hint = cnt; 2534 tp->lost_cnt_hint = cnt;
2409 2535
2410 if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) 2536 if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
2411 break; 2537 break;
2412 2538
2413 oldcnt = cnt; 2539 oldcnt = cnt;
@@ -2417,7 +2543,6 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
2417 2543
2418 if (cnt > packets) { 2544 if (cnt > packets) {
2419 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || 2545 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2420 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2421 (oldcnt >= packets)) 2546 (oldcnt >= packets))
2422 break; 2547 break;
2423 2548
@@ -2470,10 +2595,39 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
2470 tp->snd_cwnd_stamp = tcp_time_stamp; 2595 tp->snd_cwnd_stamp = tcp_time_stamp;
2471} 2596}
2472 2597
2598/* Lower bound on congestion window is slow start threshold
2599 * unless congestion avoidance choice decides to override it.
2600 */
2601static inline u32 tcp_cwnd_min(const struct sock *sk)
2602{
2603 const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
2604
2605 return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
2606}
2607
2608/* Decrease cwnd each second ack. */
2609static void tcp_cwnd_down(struct sock *sk, int flag)
2610{
2611 struct tcp_sock *tp = tcp_sk(sk);
2612 int decr = tp->snd_cwnd_cnt + 1;
2613
2614 if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
2615 (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
2616 tp->snd_cwnd_cnt = decr & 1;
2617 decr >>= 1;
2618
2619 if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
2620 tp->snd_cwnd -= decr;
2621
2622 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
2623 tp->snd_cwnd_stamp = tcp_time_stamp;
2624 }
2625}
2626
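
The rate-halving in tcp_cwnd_down() above works out to shaving one segment off cwnd for roughly every two ACKs until the tcp_cwnd_min() floor is reached. Here is a tiny standalone simulation of just that counter arithmetic, ignoring the flag tests and the packets-in-flight clamp:

/* Standalone simulation of the tcp_cwnd_down() counter: cwnd loses one
 * segment per two ACKs until it reaches the ssthresh floor. */
#include <stdio.h>

int main(void)
{
        unsigned int cwnd = 20, ssthresh = 10, cwnd_cnt = 0;

        for (int ack = 1; ack <= 12; ack++) {
                unsigned int decr = cwnd_cnt + 1;

                cwnd_cnt = decr & 1;
                decr >>= 1;
                if (decr && cwnd > ssthresh)
                        cwnd -= decr;
                printf("ack %2d: cwnd=%u\n", ack, cwnd);
        }
        return 0;       /* cwnd reaches 14 here; ~20 ACKs to hit ssthresh */
}
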
2473/* Nothing was retransmitted or returned timestamp is less 2627/* Nothing was retransmitted or returned timestamp is less
2474 * than timestamp of the first retransmission. 2628 * than timestamp of the first retransmission.
2475 */ 2629 */
2476static inline bool tcp_packet_delayed(const struct tcp_sock *tp) 2630static inline int tcp_packet_delayed(struct tcp_sock *tp)
2477{ 2631{
2478 return !tp->retrans_stamp || 2632 return !tp->retrans_stamp ||
2479 (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 2633 (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2489,22 +2643,22 @@ static void DBGUNDO(struct sock *sk, const char *msg)
2489 struct inet_sock *inet = inet_sk(sk); 2643 struct inet_sock *inet = inet_sk(sk);
2490 2644
2491 if (sk->sk_family == AF_INET) { 2645 if (sk->sk_family == AF_INET) {
2492 pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", 2646 printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
2493 msg, 2647 msg,
2494 &inet->inet_daddr, ntohs(inet->inet_dport), 2648 &inet->inet_daddr, ntohs(inet->inet_dport),
2495 tp->snd_cwnd, tcp_left_out(tp), 2649 tp->snd_cwnd, tcp_left_out(tp),
2496 tp->snd_ssthresh, tp->prior_ssthresh, 2650 tp->snd_ssthresh, tp->prior_ssthresh,
2497 tp->packets_out); 2651 tp->packets_out);
2498 } 2652 }
2499#if IS_ENABLED(CONFIG_IPV6) 2653#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2500 else if (sk->sk_family == AF_INET6) { 2654 else if (sk->sk_family == AF_INET6) {
2501 struct ipv6_pinfo *np = inet6_sk(sk); 2655 struct ipv6_pinfo *np = inet6_sk(sk);
2502 pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", 2656 printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
2503 msg, 2657 msg,
2504 &np->daddr, ntohs(inet->inet_dport), 2658 &np->daddr, ntohs(inet->inet_dport),
2505 tp->snd_cwnd, tcp_left_out(tp), 2659 tp->snd_cwnd, tcp_left_out(tp),
2506 tp->snd_ssthresh, tp->prior_ssthresh, 2660 tp->snd_ssthresh, tp->prior_ssthresh,
2507 tp->packets_out); 2661 tp->packets_out);
2508 } 2662 }
2509#endif 2663#endif
2510} 2664}
@@ -2534,13 +2688,13 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
2534 tp->snd_cwnd_stamp = tcp_time_stamp; 2688 tp->snd_cwnd_stamp = tcp_time_stamp;
2535} 2689}
2536 2690
2537static inline bool tcp_may_undo(const struct tcp_sock *tp) 2691static inline int tcp_may_undo(struct tcp_sock *tp)
2538{ 2692{
2539 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); 2693 return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
2540} 2694}
2541 2695
2542/* People celebrate: "We love our President!" */ 2696/* People celebrate: "We love our President!" */
2543static bool tcp_try_undo_recovery(struct sock *sk) 2697static int tcp_try_undo_recovery(struct sock *sk)
2544{ 2698{
2545 struct tcp_sock *tp = tcp_sk(sk); 2699 struct tcp_sock *tp = tcp_sk(sk);
2546 2700
@@ -2565,10 +2719,10 @@ static bool tcp_try_undo_recovery(struct sock *sk)
2565 * is ACKed. For Reno it is MUST to prevent false 2719 * is ACKed. For Reno it is MUST to prevent false
2566 * fast retransmits (RFC2582). SACK TCP is safe. */ 2720 * fast retransmits (RFC2582). SACK TCP is safe. */
2567 tcp_moderate_cwnd(tp); 2721 tcp_moderate_cwnd(tp);
2568 return true; 2722 return 1;
2569 } 2723 }
2570 tcp_set_ca_state(sk, TCP_CA_Open); 2724 tcp_set_ca_state(sk, TCP_CA_Open);
2571 return false; 2725 return 0;
2572} 2726}
2573 2727
2574/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ 2728/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
@@ -2598,19 +2752,19 @@ static void tcp_try_undo_dsack(struct sock *sk)
2598 * that successive retransmissions of a segment must not advance 2752 * that successive retransmissions of a segment must not advance
2599 * retrans_stamp under any conditions. 2753 * retrans_stamp under any conditions.
2600 */ 2754 */
2601static bool tcp_any_retrans_done(const struct sock *sk) 2755static int tcp_any_retrans_done(struct sock *sk)
2602{ 2756{
2603 const struct tcp_sock *tp = tcp_sk(sk); 2757 struct tcp_sock *tp = tcp_sk(sk);
2604 struct sk_buff *skb; 2758 struct sk_buff *skb;
2605 2759
2606 if (tp->retrans_out) 2760 if (tp->retrans_out)
2607 return true; 2761 return 1;
2608 2762
2609 skb = tcp_write_queue_head(sk); 2763 skb = tcp_write_queue_head(sk);
2610 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) 2764 if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2611 return true; 2765 return 1;
2612 2766
2613 return false; 2767 return 0;
2614} 2768}
2615 2769
2616/* Undo during fast recovery after partial ACK. */ 2770/* Undo during fast recovery after partial ACK. */
@@ -2644,7 +2798,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
2644} 2798}
2645 2799
2646/* Undo during loss recovery after partial ACK. */ 2800/* Undo during loss recovery after partial ACK. */
2647static bool tcp_try_undo_loss(struct sock *sk) 2801static int tcp_try_undo_loss(struct sock *sk)
2648{ 2802{
2649 struct tcp_sock *tp = tcp_sk(sk); 2803 struct tcp_sock *tp = tcp_sk(sk);
2650 2804
@@ -2666,91 +2820,28 @@ static bool tcp_try_undo_loss(struct sock *sk)
2666 tp->undo_marker = 0; 2820 tp->undo_marker = 0;
2667 if (tcp_is_sack(tp)) 2821 if (tcp_is_sack(tp))
2668 tcp_set_ca_state(sk, TCP_CA_Open); 2822 tcp_set_ca_state(sk, TCP_CA_Open);
2669 return true; 2823 return 1;
2670 }
2671 return false;
2672}
2673
2674/* The cwnd reduction in CWR and Recovery use the PRR algorithm
2675 * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/
2676 * It computes the number of packets to send (sndcnt) based on packets newly
2677 * delivered:
2678 * 1) If the packets in flight is larger than ssthresh, PRR spreads the
2679 * cwnd reductions across a full RTT.
2680 * 2) If packets in flight is lower than ssthresh (such as due to excess
2681 * losses and/or application stalls), do not perform any further cwnd
2682 * reductions, but instead slow start up to ssthresh.
2683 */
2684static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2685{
2686 struct tcp_sock *tp = tcp_sk(sk);
2687
2688 tp->high_seq = tp->snd_nxt;
2689 tp->bytes_acked = 0;
2690 tp->snd_cwnd_cnt = 0;
2691 tp->prior_cwnd = tp->snd_cwnd;
2692 tp->prr_delivered = 0;
2693 tp->prr_out = 0;
2694 if (set_ssthresh)
2695 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2696 TCP_ECN_queue_cwr(tp);
2697}
2698
2699static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
2700 int fast_rexmit)
2701{
2702 struct tcp_sock *tp = tcp_sk(sk);
2703 int sndcnt = 0;
2704 int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2705
2706 tp->prr_delivered += newly_acked_sacked;
2707 if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
2708 u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2709 tp->prior_cwnd - 1;
2710 sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
2711 } else {
2712 sndcnt = min_t(int, delta,
2713 max_t(int, tp->prr_delivered - tp->prr_out,
2714 newly_acked_sacked) + 1);
2715 } 2824 }
2716 2825 return 0;
2717 sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
2718 tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
2719} 2826}
2720 2827
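
For readers following the PRR block removed above: the sndcnt computation can be read as two regimes keyed on packets in flight versus ssthresh. The sketch below restates that arithmetic with plain parameters in place of the tcp_sock fields; prr_sndcnt is an illustrative name, not a kernel function.

/* Restatement of the removed PRR computation: while in_flight > ssthresh,
 * release ssthresh/prior_cwnd of what the receiver has newly reported
 * delivered (spreading the reduction over one RTT); otherwise stop cutting
 * and slow-start back toward ssthresh. */
#include <stdint.h>

static int prr_sndcnt(uint32_t ssthresh, uint32_t prior_cwnd, uint32_t in_flight,
                      uint32_t prr_delivered, uint32_t prr_out,
                      uint32_t newly_acked_sacked, int fast_rexmit)
{
        int sndcnt;

        if (in_flight > ssthresh) {
                uint64_t dividend = (uint64_t)ssthresh * prr_delivered +
                                    prior_cwnd - 1;
                sndcnt = (int)(dividend / prior_cwnd) - (int)prr_out;
        } else {
                int delta = (int)(ssthresh - in_flight);
                int catchup = (int)(prr_delivered - prr_out);

                if (catchup < (int)newly_acked_sacked)
                        catchup = (int)newly_acked_sacked;
                sndcnt = delta < catchup + 1 ? delta : catchup + 1;
        }
        if (sndcnt < (fast_rexmit ? 1 : 0))
                sndcnt = fast_rexmit ? 1 : 0;
        return sndcnt;  /* the new cwnd is packets in flight plus sndcnt */
}
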
2721static inline void tcp_end_cwnd_reduction(struct sock *sk) 2828static inline void tcp_complete_cwr(struct sock *sk)
2722{ 2829{
2723 struct tcp_sock *tp = tcp_sk(sk); 2830 struct tcp_sock *tp = tcp_sk(sk);
2724 2831 /* Do not moderate cwnd if it's already undone in cwr or recovery */
2725 /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ 2832 if (tp->undo_marker && tp->snd_cwnd > tp->snd_ssthresh) {
2726 if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
2727 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
2728 tp->snd_cwnd = tp->snd_ssthresh; 2833 tp->snd_cwnd = tp->snd_ssthresh;
2729 tp->snd_cwnd_stamp = tcp_time_stamp; 2834 tp->snd_cwnd_stamp = tcp_time_stamp;
2730 } 2835 }
2731 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2836 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2732} 2837}
2733 2838
2734/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
2735void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
2736{
2737 struct tcp_sock *tp = tcp_sk(sk);
2738
2739 tp->prior_ssthresh = 0;
2740 tp->bytes_acked = 0;
2741 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2742 tp->undo_marker = 0;
2743 tcp_init_cwnd_reduction(sk, set_ssthresh);
2744 tcp_set_ca_state(sk, TCP_CA_CWR);
2745 }
2746}
2747
2748static void tcp_try_keep_open(struct sock *sk) 2839static void tcp_try_keep_open(struct sock *sk)
2749{ 2840{
2750 struct tcp_sock *tp = tcp_sk(sk); 2841 struct tcp_sock *tp = tcp_sk(sk);
2751 int state = TCP_CA_Open; 2842 int state = TCP_CA_Open;
2752 2843
2753 if (tcp_left_out(tp) || tcp_any_retrans_done(sk)) 2844 if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
2754 state = TCP_CA_Disorder; 2845 state = TCP_CA_Disorder;
2755 2846
2756 if (inet_csk(sk)->icsk_ca_state != state) { 2847 if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2759,7 +2850,7 @@ static void tcp_try_keep_open(struct sock *sk)
2759 } 2850 }
2760} 2851}
2761 2852
2762static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) 2853static void tcp_try_to_open(struct sock *sk, int flag)
2763{ 2854{
2764 struct tcp_sock *tp = tcp_sk(sk); 2855 struct tcp_sock *tp = tcp_sk(sk);
2765 2856
@@ -2773,10 +2864,9 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked)
2773 2864
2774 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { 2865 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2775 tcp_try_keep_open(sk); 2866 tcp_try_keep_open(sk);
2776 if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open) 2867 tcp_moderate_cwnd(tp);
2777 tcp_moderate_cwnd(tp);
2778 } else { 2868 } else {
2779 tcp_cwnd_reduction(sk, newly_acked_sacked, 0); 2869 tcp_cwnd_down(sk, flag);
2780 } 2870 }
2781} 2871}
2782 2872
@@ -2858,30 +2948,6 @@ void tcp_simple_retransmit(struct sock *sk)
2858} 2948}
2859EXPORT_SYMBOL(tcp_simple_retransmit); 2949EXPORT_SYMBOL(tcp_simple_retransmit);
2860 2950
2861static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2862{
2863 struct tcp_sock *tp = tcp_sk(sk);
2864 int mib_idx;
2865
2866 if (tcp_is_reno(tp))
2867 mib_idx = LINUX_MIB_TCPRENORECOVERY;
2868 else
2869 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
2870
2871 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2872
2873 tp->prior_ssthresh = 0;
2874 tp->undo_marker = tp->snd_una;
2875 tp->undo_retrans = tp->retrans_out;
2876
2877 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2878 if (!ece_ack)
2879 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2880 tcp_init_cwnd_reduction(sk, true);
2881 }
2882 tcp_set_ca_state(sk, TCP_CA_Recovery);
2883}
2884
2885/* Process an event, which can update packets-in-flight not trivially. 2951/* Process an event, which can update packets-in-flight not trivially.
2886 * Main goal of this function is to calculate new estimate for left_out, 2952 * Main goal of this function is to calculate new estimate for left_out,
2887 * taking into account both packets sitting in receiver's buffer and 2953 * taking into account both packets sitting in receiver's buffer and
@@ -2893,16 +2959,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2893 * It does _not_ decide what to send, it is made in function 2959 * It does _not_ decide what to send, it is made in function
2894 * tcp_xmit_retransmit_queue(). 2960 * tcp_xmit_retransmit_queue().
2895 */ 2961 */
2896static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, 2962static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
2897 int prior_sacked, bool is_dupack,
2898 int flag)
2899{ 2963{
2900 struct inet_connection_sock *icsk = inet_csk(sk); 2964 struct inet_connection_sock *icsk = inet_csk(sk);
2901 struct tcp_sock *tp = tcp_sk(sk); 2965 struct tcp_sock *tp = tcp_sk(sk);
2966 int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
2902 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && 2967 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2903 (tcp_fackets_out(tp) > tp->reordering)); 2968 (tcp_fackets_out(tp) > tp->reordering));
2904 int newly_acked_sacked = 0; 2969 int fast_rexmit = 0, mib_idx;
2905 int fast_rexmit = 0;
2906 2970
2907 if (WARN_ON(!tp->packets_out && tp->sacked_out)) 2971 if (WARN_ON(!tp->packets_out && tp->sacked_out))
2908 tp->sacked_out = 0; 2972 tp->sacked_out = 0;
@@ -2918,10 +2982,19 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2918 if (tcp_check_sack_reneging(sk, flag)) 2982 if (tcp_check_sack_reneging(sk, flag))
2919 return; 2983 return;
2920 2984
2921 /* C. Check consistency of the current state. */ 2985 /* C. Process data loss notification, provided it is valid. */
2986 if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
2987 before(tp->snd_una, tp->high_seq) &&
2988 icsk->icsk_ca_state != TCP_CA_Open &&
2989 tp->fackets_out > tp->reordering) {
2990 tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
2991 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
2992 }
2993
2994 /* D. Check consistency of the current state. */
2922 tcp_verify_left_out(tp); 2995 tcp_verify_left_out(tp);
2923 2996
2924 /* D. Check state exit conditions. State can be terminated 2997 /* E. Check state exit conditions. State can be terminated
2925 * when high_seq is ACKed. */ 2998 * when high_seq is ACKed. */
2926 if (icsk->icsk_ca_state == TCP_CA_Open) { 2999 if (icsk->icsk_ca_state == TCP_CA_Open) {
2927 WARN_ON(tp->retrans_out != 0); 3000 WARN_ON(tp->retrans_out != 0);
@@ -2938,7 +3011,18 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2938 /* CWR is to be held something *above* high_seq 3011 /* CWR is to be held something *above* high_seq
2939 * is ACKed for CWR bit to reach receiver. */ 3012 * is ACKed for CWR bit to reach receiver. */
2940 if (tp->snd_una != tp->high_seq) { 3013 if (tp->snd_una != tp->high_seq) {
2941 tcp_end_cwnd_reduction(sk); 3014 tcp_complete_cwr(sk);
3015 tcp_set_ca_state(sk, TCP_CA_Open);
3016 }
3017 break;
3018
3019 case TCP_CA_Disorder:
3020 tcp_try_undo_dsack(sk);
3021 if (!tp->undo_marker ||
3022 /* For SACK case do not Open to allow to undo
3023 * catching for all duplicate ACKs. */
3024 tcp_is_reno(tp) || tp->snd_una != tp->high_seq) {
3025 tp->undo_marker = 0;
2942 tcp_set_ca_state(sk, TCP_CA_Open); 3026 tcp_set_ca_state(sk, TCP_CA_Open);
2943 } 3027 }
2944 break; 3028 break;
@@ -2948,12 +3032,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2948 tcp_reset_reno_sack(tp); 3032 tcp_reset_reno_sack(tp);
2949 if (tcp_try_undo_recovery(sk)) 3033 if (tcp_try_undo_recovery(sk))
2950 return; 3034 return;
2951 tcp_end_cwnd_reduction(sk); 3035 tcp_complete_cwr(sk);
2952 break; 3036 break;
2953 } 3037 }
2954 } 3038 }
2955 3039
2956 /* E. Process state. */ 3040 /* F. Process state. */
2957 switch (icsk->icsk_ca_state) { 3041 switch (icsk->icsk_ca_state) {
2958 case TCP_CA_Recovery: 3042 case TCP_CA_Recovery:
2959 if (!(flag & FLAG_SND_UNA_ADVANCED)) { 3043 if (!(flag & FLAG_SND_UNA_ADVANCED)) {
@@ -2961,7 +3045,6 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2961 tcp_add_reno_sack(sk); 3045 tcp_add_reno_sack(sk);
2962 } else 3046 } else
2963 do_lost = tcp_try_undo_partial(sk, pkts_acked); 3047 do_lost = tcp_try_undo_partial(sk, pkts_acked);
2964 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
2965 break; 3048 break;
2966 case TCP_CA_Loss: 3049 case TCP_CA_Loss:
2967 if (flag & FLAG_DATA_ACKED) 3050 if (flag & FLAG_DATA_ACKED)
@@ -2983,13 +3066,12 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2983 if (is_dupack) 3066 if (is_dupack)
2984 tcp_add_reno_sack(sk); 3067 tcp_add_reno_sack(sk);
2985 } 3068 }
2986 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
2987 3069
2988 if (icsk->icsk_ca_state <= TCP_CA_Disorder) 3070 if (icsk->icsk_ca_state == TCP_CA_Disorder)
2989 tcp_try_undo_dsack(sk); 3071 tcp_try_undo_dsack(sk);
2990 3072
2991 if (!tcp_time_to_recover(sk, flag)) { 3073 if (!tcp_time_to_recover(sk)) {
2992 tcp_try_to_open(sk, flag, newly_acked_sacked); 3074 tcp_try_to_open(sk, flag);
2993 return; 3075 return;
2994 } 3076 }
2995 3077
@@ -3005,13 +3087,35 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3005 } 3087 }
3006 3088
3007 /* Otherwise enter Recovery state */ 3089 /* Otherwise enter Recovery state */
3008 tcp_enter_recovery(sk, (flag & FLAG_ECE)); 3090
3091 if (tcp_is_reno(tp))
3092 mib_idx = LINUX_MIB_TCPRENORECOVERY;
3093 else
3094 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
3095
3096 NET_INC_STATS_BH(sock_net(sk), mib_idx);
3097
3098 tp->high_seq = tp->snd_nxt;
3099 tp->prior_ssthresh = 0;
3100 tp->undo_marker = tp->snd_una;
3101 tp->undo_retrans = tp->retrans_out;
3102
3103 if (icsk->icsk_ca_state < TCP_CA_CWR) {
3104 if (!(flag & FLAG_ECE))
3105 tp->prior_ssthresh = tcp_current_ssthresh(sk);
3106 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
3107 TCP_ECN_queue_cwr(tp);
3108 }
3109
3110 tp->bytes_acked = 0;
3111 tp->snd_cwnd_cnt = 0;
3112 tcp_set_ca_state(sk, TCP_CA_Recovery);
3009 fast_rexmit = 1; 3113 fast_rexmit = 1;
3010 } 3114 }
3011 3115
3012 if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) 3116 if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
3013 tcp_update_scoreboard(sk, fast_rexmit); 3117 tcp_update_scoreboard(sk, fast_rexmit);
3014 tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit); 3118 tcp_cwnd_down(sk, flag);
3015 tcp_xmit_retransmit_queue(sk); 3119 tcp_xmit_retransmit_queue(sk);
3016} 3120}
3017 3121
@@ -3086,53 +3190,16 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
3086/* Restart timer after forward progress on connection. 3190/* Restart timer after forward progress on connection.
3087 * RFC2988 recommends to restart timer to now+rto. 3191 * RFC2988 recommends to restart timer to now+rto.
3088 */ 3192 */
3089void tcp_rearm_rto(struct sock *sk) 3193static void tcp_rearm_rto(struct sock *sk)
3090{ 3194{
3091 struct tcp_sock *tp = tcp_sk(sk); 3195 struct tcp_sock *tp = tcp_sk(sk);
3092 3196
3093 /* If the retrans timer is currently being used by Fast Open
3094 * for SYN-ACK retrans purpose, stay put.
3095 */
3096 if (tp->fastopen_rsk)
3097 return;
3098
3099 if (!tp->packets_out) { 3197 if (!tp->packets_out) {
3100 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 3198 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3101 } else { 3199 } else {
3102 u32 rto = inet_csk(sk)->icsk_rto; 3200 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3103 /* Offset the time elapsed after installing regular RTO */ 3201 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
3104 if (tp->early_retrans_delayed) {
3105 struct sk_buff *skb = tcp_write_queue_head(sk);
3106 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
3107 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
3108 /* delta may not be positive if the socket is locked
3109 * when the delayed ER timer fires and is rescheduled.
3110 */
3111 if (delta > 0)
3112 rto = delta;
3113 }
3114 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
3115 TCP_RTO_MAX);
3116 } 3202 }
3117 tp->early_retrans_delayed = 0;
3118}
3119
3120/* This function is called when the delayed ER timer fires. TCP enters
3121 * fast recovery and performs fast-retransmit.
3122 */
3123void tcp_resume_early_retransmit(struct sock *sk)
3124{
3125 struct tcp_sock *tp = tcp_sk(sk);
3126
3127 tcp_rearm_rto(sk);
3128
3129 /* Stop if ER is disabled after the delayed ER timer is scheduled */
3130 if (!tp->do_early_retrans)
3131 return;
3132
3133 tcp_enter_recovery(sk, false);
3134 tcp_update_scoreboard(sk, 1);
3135 tcp_xmit_retransmit_queue(sk);
3136} 3203}
3137 3204
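
In the early-retransmit branch removed from tcp_rearm_rto() above, the timer is re-armed only for the portion of the RTO that has not already elapsed since the head skb was stamped, falling back to the full RTO when nothing positive remains. That arithmetic, isolated below; remaining_rto is an illustrative name, and jiffies wrap is handled by the signed subtraction as in the original:

/* Isolated form of the removed offset: remaining = head_when + rto - now,
 * taken in signed 32-bit so jiffies wrap-around behaves, with the full rto
 * used when the remainder is zero or negative. */
#include <stdint.h>

static uint32_t remaining_rto(uint32_t head_when, uint32_t rto, uint32_t now)
{
        int32_t delta = (int32_t)(head_when + rto - now);

        return delta > 0 ? (uint32_t)delta : rto;
}
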
3138/* If we get here, the whole TSO packet has not been acked. */ 3205/* If we get here, the whole TSO packet has not been acked. */
@@ -3167,7 +3234,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3167 const struct inet_connection_sock *icsk = inet_csk(sk); 3234 const struct inet_connection_sock *icsk = inet_csk(sk);
3168 struct sk_buff *skb; 3235 struct sk_buff *skb;
3169 u32 now = tcp_time_stamp; 3236 u32 now = tcp_time_stamp;
3170 int fully_acked = true; 3237 int fully_acked = 1;
3171 int flag = 0; 3238 int flag = 0;
3172 u32 pkts_acked = 0; 3239 u32 pkts_acked = 0;
3173 u32 reord = tp->packets_out; 3240 u32 reord = tp->packets_out;
@@ -3191,7 +3258,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3191 if (!acked_pcount) 3258 if (!acked_pcount)
3192 break; 3259 break;
3193 3260
3194 fully_acked = false; 3261 fully_acked = 0;
3195 } else { 3262 } else {
3196 acked_pcount = tcp_skb_pcount(skb); 3263 acked_pcount = tcp_skb_pcount(skb);
3197 } 3264 }
@@ -3229,7 +3296,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3229 * connection startup slow start one packet too 3296 * connection startup slow start one packet too
3230 * quickly. This is severely frowned upon behavior. 3297 * quickly. This is severely frowned upon behavior.
3231 */ 3298 */
3232 if (!(scb->tcp_flags & TCPHDR_SYN)) { 3299 if (!(scb->flags & TCPHDR_SYN)) {
3233 flag |= FLAG_DATA_ACKED; 3300 flag |= FLAG_DATA_ACKED;
3234 } else { 3301 } else {
3235 flag |= FLAG_SYN_ACKED; 3302 flag |= FLAG_SYN_ACKED;
@@ -3308,18 +3375,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3308 if (!tp->packets_out && tcp_is_sack(tp)) { 3375 if (!tp->packets_out && tcp_is_sack(tp)) {
3309 icsk = inet_csk(sk); 3376 icsk = inet_csk(sk);
3310 if (tp->lost_out) { 3377 if (tp->lost_out) {
3311 pr_debug("Leak l=%u %d\n", 3378 printk(KERN_DEBUG "Leak l=%u %d\n",
3312 tp->lost_out, icsk->icsk_ca_state); 3379 tp->lost_out, icsk->icsk_ca_state);
3313 tp->lost_out = 0; 3380 tp->lost_out = 0;
3314 } 3381 }
3315 if (tp->sacked_out) { 3382 if (tp->sacked_out) {
3316 pr_debug("Leak s=%u %d\n", 3383 printk(KERN_DEBUG "Leak s=%u %d\n",
3317 tp->sacked_out, icsk->icsk_ca_state); 3384 tp->sacked_out, icsk->icsk_ca_state);
3318 tp->sacked_out = 0; 3385 tp->sacked_out = 0;
3319 } 3386 }
3320 if (tp->retrans_out) { 3387 if (tp->retrans_out) {
3321 pr_debug("Leak r=%u %d\n", 3388 printk(KERN_DEBUG "Leak r=%u %d\n",
3322 tp->retrans_out, icsk->icsk_ca_state); 3389 tp->retrans_out, icsk->icsk_ca_state);
3323 tp->retrans_out = 0; 3390 tp->retrans_out = 0;
3324 } 3391 }
3325 } 3392 }
@@ -3347,23 +3414,23 @@ static void tcp_ack_probe(struct sock *sk)
3347 } 3414 }
3348} 3415}
3349 3416
3350static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) 3417static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
3351{ 3418{
3352 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3419 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3353 inet_csk(sk)->icsk_ca_state != TCP_CA_Open; 3420 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3354} 3421}
3355 3422
3356static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) 3423static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3357{ 3424{
3358 const struct tcp_sock *tp = tcp_sk(sk); 3425 const struct tcp_sock *tp = tcp_sk(sk);
3359 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && 3426 return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
3360 !tcp_in_cwnd_reduction(sk); 3427 !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
3361} 3428}
3362 3429
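The new-side tcp_may_raise_cwnd() above tests "is the connection in Recovery or CWR" with a single bitmask AND. Below is a small stand-alone sketch of that trick; the enum values are chosen to mirror the kernel's tcp_ca_state ordering, which is an assumption and not part of this diff.

#include <stdbool.h>
#include <stdio.h>

enum ca_state { CA_Open, CA_Disorder, CA_CWR, CA_Recovery, CA_Loss };
#define CAF(s) (1u << (s))                       /* state -> single-bit flag */

/* True when the state is any member of the {Recovery, CWR} set. */
static bool in_cwnd_reduction(enum ca_state s)
{
	return CAF(s) & (CAF(CA_Recovery) | CAF(CA_CWR));
}

int main(void)
{
	printf("Open: %d  CWR: %d  Recovery: %d\n",
	       in_cwnd_reduction(CA_Open),
	       in_cwnd_reduction(CA_CWR),
	       in_cwnd_reduction(CA_Recovery));  /* 0 1 1 */
	return 0;
}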
3363/* Check that window update is acceptable. 3430/* Check that window update is acceptable.
3364 * The function assumes that snd_una<=ack<=snd_next. 3431 * The function assumes that snd_una<=ack<=snd_next.
3365 */ 3432 */
3366static inline bool tcp_may_update_window(const struct tcp_sock *tp, 3433static inline int tcp_may_update_window(const struct tcp_sock *tp,
3367 const u32 ack, const u32 ack_seq, 3434 const u32 ack, const u32 ack_seq,
3368 const u32 nwin) 3435 const u32 nwin)
3369{ 3436{
@@ -3377,7 +3444,7 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3377 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 3444 * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
3378 * and in FreeBSD. NetBSD's one is even worse.) is wrong. 3445 * and in FreeBSD. NetBSD's one is even worse.) is wrong.
3379 */ 3446 */
3380static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, 3447static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
3381 u32 ack_seq) 3448 u32 ack_seq)
3382{ 3449{
3383 struct tcp_sock *tp = tcp_sk(sk); 3450 struct tcp_sock *tp = tcp_sk(sk);
@@ -3425,9 +3492,9 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
3425} 3492}
3426 3493
3427/* A conservative spurious RTO response algorithm: reduce cwnd using 3494/* A conservative spurious RTO response algorithm: reduce cwnd using
3428 * PRR and continue in congestion avoidance. 3495 * rate halving and continue in congestion avoidance.
3429 */ 3496 */
3430static void tcp_cwr_spur_to_response(struct sock *sk) 3497static void tcp_ratehalving_spur_to_response(struct sock *sk)
3431{ 3498{
3432 tcp_enter_cwr(sk, 0); 3499 tcp_enter_cwr(sk, 0);
3433} 3500}
@@ -3435,7 +3502,7 @@ static void tcp_cwr_spur_to_response(struct sock *sk)
3435static void tcp_undo_spur_to_response(struct sock *sk, int flag) 3502static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3436{ 3503{
3437 if (flag & FLAG_ECE) 3504 if (flag & FLAG_ECE)
3438 tcp_cwr_spur_to_response(sk); 3505 tcp_ratehalving_spur_to_response(sk);
3439 else 3506 else
3440 tcp_undo_cwr(sk, true); 3507 tcp_undo_cwr(sk, true);
3441} 3508}
@@ -3470,7 +3537,7 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3470 * to prove that the RTO is indeed spurious. It transfers the control 3537 * to prove that the RTO is indeed spurious. It transfers the control
3471 * from F-RTO to the conventional RTO recovery 3538 * from F-RTO to the conventional RTO recovery
3472 */ 3539 */
3473static bool tcp_process_frto(struct sock *sk, int flag) 3540static int tcp_process_frto(struct sock *sk, int flag)
3474{ 3541{
3475 struct tcp_sock *tp = tcp_sk(sk); 3542 struct tcp_sock *tp = tcp_sk(sk);
3476 3543
@@ -3486,7 +3553,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3486 3553
3487 if (!before(tp->snd_una, tp->frto_highmark)) { 3554 if (!before(tp->snd_una, tp->frto_highmark)) {
3488 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); 3555 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
3489 return true; 3556 return 1;
3490 } 3557 }
3491 3558
3492 if (!tcp_is_sackfrto(tp)) { 3559 if (!tcp_is_sackfrto(tp)) {
@@ -3495,19 +3562,19 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3495 * data, winupdate 3562 * data, winupdate
3496 */ 3563 */
3497 if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) 3564 if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
3498 return true; 3565 return 1;
3499 3566
3500 if (!(flag & FLAG_DATA_ACKED)) { 3567 if (!(flag & FLAG_DATA_ACKED)) {
3501 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), 3568 tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
3502 flag); 3569 flag);
3503 return true; 3570 return 1;
3504 } 3571 }
3505 } else { 3572 } else {
3506 if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { 3573 if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
3507 /* Prevent sending of new data. */ 3574 /* Prevent sending of new data. */
3508 tp->snd_cwnd = min(tp->snd_cwnd, 3575 tp->snd_cwnd = min(tp->snd_cwnd,
3509 tcp_packets_in_flight(tp)); 3576 tcp_packets_in_flight(tp));
3510 return true; 3577 return 1;
3511 } 3578 }
3512 3579
3513 if ((tp->frto_counter >= 2) && 3580 if ((tp->frto_counter >= 2) &&
@@ -3517,10 +3584,10 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3517 /* RFC4138 shortcoming (see comment above) */ 3584 /* RFC4138 shortcoming (see comment above) */
3518 if (!(flag & FLAG_FORWARD_PROGRESS) && 3585 if (!(flag & FLAG_FORWARD_PROGRESS) &&
3519 (flag & FLAG_NOT_DUP)) 3586 (flag & FLAG_NOT_DUP))
3520 return true; 3587 return 1;
3521 3588
3522 tcp_enter_frto_loss(sk, 3, flag); 3589 tcp_enter_frto_loss(sk, 3, flag);
3523 return true; 3590 return 1;
3524 } 3591 }
3525 } 3592 }
3526 3593
@@ -3532,7 +3599,7 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3532 if (!tcp_may_send_now(sk)) 3599 if (!tcp_may_send_now(sk))
3533 tcp_enter_frto_loss(sk, 2, flag); 3600 tcp_enter_frto_loss(sk, 2, flag);
3534 3601
3535 return true; 3602 return 1;
3536 } else { 3603 } else {
3537 switch (sysctl_tcp_frto_response) { 3604 switch (sysctl_tcp_frto_response) {
3538 case 2: 3605 case 2:
@@ -3542,61 +3609,34 @@ static bool tcp_process_frto(struct sock *sk, int flag)
3542 tcp_conservative_spur_to_response(tp); 3609 tcp_conservative_spur_to_response(tp);
3543 break; 3610 break;
3544 default: 3611 default:
3545 tcp_cwr_spur_to_response(sk); 3612 tcp_ratehalving_spur_to_response(sk);
3546 break; 3613 break;
3547 } 3614 }
3548 tp->frto_counter = 0; 3615 tp->frto_counter = 0;
3549 tp->undo_marker = 0; 3616 tp->undo_marker = 0;
3550 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); 3617 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
3551 } 3618 }
3552 return false; 3619 return 0;
3553}
3554
3555/* RFC 5961 7 [ACK Throttling] */
3556static void tcp_send_challenge_ack(struct sock *sk)
3557{
3558 /* unprotected vars, we dont care of overwrites */
3559 static u32 challenge_timestamp;
3560 static unsigned int challenge_count;
3561 u32 now = jiffies / HZ;
3562
3563 if (now != challenge_timestamp) {
3564 challenge_timestamp = now;
3565 challenge_count = 0;
3566 }
3567 if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
3568 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
3569 tcp_send_ack(sk);
3570 }
3571} 3620}
3572 3621
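The tcp_send_challenge_ack() removed above rate-limits challenge ACKs to a fixed number per second (RFC 5961 section 7). A rough user-space sketch of that counter-reset pattern follows; the limit value and the send/drop reporting are made up for illustration.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static const unsigned int challenge_ack_limit = 3;  /* hypothetical per-second cap */

/* Returns true when the current one-second window is already over the cap. */
static bool challenge_ack_throttled(time_t now)
{
	static time_t window_start;
	static unsigned int count;

	if (now != window_start) {   /* new second: reset the counter */
		window_start = now;
		count = 0;
	}
	return ++count > challenge_ack_limit;
}

int main(void)
{
	time_t now = time(NULL);

	for (int i = 0; i < 5; i++)
		printf("ack %d: %s\n", i + 1,
		       challenge_ack_throttled(now) ? "dropped" : "challenge ACK sent");
	return 0;
}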
3573/* This routine deals with incoming acks, but not outgoing ones. */ 3622/* This routine deals with incoming acks, but not outgoing ones. */
3574static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3623static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3575{ 3624{
3576 struct inet_connection_sock *icsk = inet_csk(sk); 3625 struct inet_connection_sock *icsk = inet_csk(sk);
3577 struct tcp_sock *tp = tcp_sk(sk); 3626 struct tcp_sock *tp = tcp_sk(sk);
3578 u32 prior_snd_una = tp->snd_una; 3627 u32 prior_snd_una = tp->snd_una;
3579 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3628 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3580 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3629 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3581 bool is_dupack = false;
3582 u32 prior_in_flight; 3630 u32 prior_in_flight;
3583 u32 prior_fackets; 3631 u32 prior_fackets;
3584 int prior_packets; 3632 int prior_packets;
3585 int prior_sacked = tp->sacked_out; 3633 int frto_cwnd = 0;
3586 int pkts_acked = 0;
3587 bool frto_cwnd = false;
3588 3634
3589 /* If the ack is older than previous acks 3635 /* If the ack is older than previous acks
3590 * then we can probably ignore it. 3636 * then we can probably ignore it.
3591 */ 3637 */
3592 if (before(ack, prior_snd_una)) { 3638 if (before(ack, prior_snd_una))
3593 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
3594 if (before(ack, prior_snd_una - tp->max_window)) {
3595 tcp_send_challenge_ack(sk);
3596 return -1;
3597 }
3598 goto old_ack; 3639 goto old_ack;
3599 }
3600 3640
3601 /* If the ack includes data we haven't sent yet, discard 3641 /* If the ack includes data we haven't sent yet, discard
3602 * this segment (RFC793 Section 3.9). 3642 * this segment (RFC793 Section 3.9).
@@ -3604,9 +3644,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3604 if (after(ack, tp->snd_nxt)) 3644 if (after(ack, tp->snd_nxt))
3605 goto invalid_ack; 3645 goto invalid_ack;
3606 3646
3607 if (tp->early_retrans_delayed)
3608 tcp_rearm_rto(sk);
3609
3610 if (after(ack, prior_snd_una)) 3647 if (after(ack, prior_snd_una))
3611 flag |= FLAG_SND_UNA_ADVANCED; 3648 flag |= FLAG_SND_UNA_ADVANCED;
3612 3649
@@ -3664,8 +3701,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3664 /* See if we can take anything off of the retransmit queue. */ 3701 /* See if we can take anything off of the retransmit queue. */
3665 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); 3702 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
3666 3703
3667 pkts_acked = prior_packets - tp->packets_out;
3668
3669 if (tp->frto_counter) 3704 if (tp->frto_counter)
3670 frto_cwnd = tcp_process_frto(sk, flag); 3705 frto_cwnd = tcp_process_frto(sk, flag);
3671 /* Guarantee sacktag reordering detection against wrap-arounds */ 3706 /* Guarantee sacktag reordering detection against wrap-arounds */
@@ -3677,26 +3712,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3677 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && 3712 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
3678 tcp_may_raise_cwnd(sk, flag)) 3713 tcp_may_raise_cwnd(sk, flag))
3679 tcp_cong_avoid(sk, ack, prior_in_flight); 3714 tcp_cong_avoid(sk, ack, prior_in_flight);
3680 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3715 tcp_fastretrans_alert(sk, prior_packets - tp->packets_out,
3681 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3716 flag);
3682 is_dupack, flag);
3683 } else { 3717 } else {
3684 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) 3718 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
3685 tcp_cong_avoid(sk, ack, prior_in_flight); 3719 tcp_cong_avoid(sk, ack, prior_in_flight);
3686 } 3720 }
3687 3721
3688 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { 3722 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
3689 struct dst_entry *dst = __sk_dst_get(sk); 3723 dst_confirm(__sk_dst_get(sk));
3690 if (dst) 3724
3691 dst_confirm(dst);
3692 }
3693 return 1; 3725 return 1;
3694 3726
3695no_queue: 3727no_queue:
3696 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3697 if (flag & FLAG_DSACKING_ACK)
3698 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3699 is_dupack, flag);
3700 /* If this ack opens up a zero window, clear backoff. It was 3728 /* If this ack opens up a zero window, clear backoff. It was
3701 * being used to time the probes, and is probably far higher than 3729 * being used to time the probes, and is probably far higher than
3702 * it needs to be for normal retransmission. 3730 * it needs to be for normal retransmission.
@@ -3710,13 +3738,10 @@ invalid_ack:
3710 return -1; 3738 return -1;
3711 3739
3712old_ack: 3740old_ack:
3713 /* If data was SACKed, tag it and see if we should send more data.
3714 * If data was DSACKed, see if we can undo a cwnd reduction.
3715 */
3716 if (TCP_SKB_CB(skb)->sacked) { 3741 if (TCP_SKB_CB(skb)->sacked) {
3717 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3742 tcp_sacktag_write_queue(sk, skb, prior_snd_una);
3718 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3743 if (icsk->icsk_ca_state == TCP_CA_Open)
3719 is_dupack, flag); 3744 tcp_try_keep_open(sk);
3720 } 3745 }
3721 3746
3722 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); 3747 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -3727,15 +3752,14 @@ old_ack:
3727 * But, this can also be called on packets in the established flow when 3752 * But, this can also be called on packets in the established flow when
3728 * the fast version below fails. 3753 * the fast version below fails.
3729 */ 3754 */
3730void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, 3755void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3731 const u8 **hvpp, int estab, 3756 u8 **hvpp, int estab)
3732 struct tcp_fastopen_cookie *foc)
3733{ 3757{
3734 const unsigned char *ptr; 3758 unsigned char *ptr;
3735 const struct tcphdr *th = tcp_hdr(skb); 3759 struct tcphdr *th = tcp_hdr(skb);
3736 int length = (th->doff * 4) - sizeof(struct tcphdr); 3760 int length = (th->doff * 4) - sizeof(struct tcphdr);
3737 3761
3738 ptr = (const unsigned char *)(th + 1); 3762 ptr = (unsigned char *)(th + 1);
3739 opt_rx->saw_tstamp = 0; 3763 opt_rx->saw_tstamp = 0;
3740 3764
3741 while (length > 0) { 3765 while (length > 0) {
@@ -3772,9 +3796,10 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3772 __u8 snd_wscale = *(__u8 *)ptr; 3796 __u8 snd_wscale = *(__u8 *)ptr;
3773 opt_rx->wscale_ok = 1; 3797 opt_rx->wscale_ok = 1;
3774 if (snd_wscale > 14) { 3798 if (snd_wscale > 14) {
3775 net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n", 3799 if (net_ratelimit())
3776 __func__, 3800 printk(KERN_INFO "tcp_parse_options: Illegal window "
3777 snd_wscale); 3801 "scaling value %d >14 received.\n",
3802 snd_wscale);
3778 snd_wscale = 14; 3803 snd_wscale = 14;
3779 } 3804 }
3780 opt_rx->snd_wscale = snd_wscale; 3805 opt_rx->snd_wscale = snd_wscale;
@@ -3792,7 +3817,7 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3792 case TCPOPT_SACK_PERM: 3817 case TCPOPT_SACK_PERM:
3793 if (opsize == TCPOLEN_SACK_PERM && th->syn && 3818 if (opsize == TCPOLEN_SACK_PERM && th->syn &&
3794 !estab && sysctl_tcp_sack) { 3819 !estab && sysctl_tcp_sack) {
3795 opt_rx->sack_ok = TCP_SACK_SEEN; 3820 opt_rx->sack_ok = 1;
3796 tcp_sack_reset(opt_rx); 3821 tcp_sack_reset(opt_rx);
3797 } 3822 }
3798 break; 3823 break;
@@ -3836,25 +3861,8 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3836 break; 3861 break;
3837 } 3862 }
3838 break; 3863 break;
3839
3840 case TCPOPT_EXP:
3841 /* Fast Open option shares code 254 using a
3842 * 16 bits magic number. It's valid only in
3843 * SYN or SYN-ACK with an even size.
3844 */
3845 if (opsize < TCPOLEN_EXP_FASTOPEN_BASE ||
3846 get_unaligned_be16(ptr) != TCPOPT_FASTOPEN_MAGIC ||
3847 foc == NULL || !th->syn || (opsize & 1))
3848 break;
3849 foc->len = opsize - TCPOLEN_EXP_FASTOPEN_BASE;
3850 if (foc->len >= TCP_FASTOPEN_COOKIE_MIN &&
3851 foc->len <= TCP_FASTOPEN_COOKIE_MAX)
3852 memcpy(foc->val, ptr + 2, foc->len);
3853 else if (foc->len != 0)
3854 foc->len = -1;
3855 break;
3856
3857 } 3864 }
3865
3858 ptr += opsize-2; 3866 ptr += opsize-2;
3859 length -= opsize; 3867 length -= opsize;
3860 } 3868 }
@@ -3862,9 +3870,9 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o
3862} 3870}
3863EXPORT_SYMBOL(tcp_parse_options); 3871EXPORT_SYMBOL(tcp_parse_options);
3864 3872
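On the window-scale clamp in tcp_parse_options() above: a scale factor of 14 already lets the 16-bit window field advertise roughly 1 GiB, the RFC 1323 maximum, which is why larger values are logged and clamped. A tiny worked example in plain arithmetic, not kernel code:

#include <stdio.h>

int main(void)
{
	const unsigned long long max_win = 65535ULL;   /* 16-bit window field */

	for (int wscale = 13; wscale <= 15; wscale++)
		printf("wscale %2d -> up to %llu bytes%s\n",
		       wscale, max_win << wscale,
		       wscale > 14 ? "  (illegal, clamped to 14)" : "");
	return 0;
}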
3865static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) 3873static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
3866{ 3874{
3867 const __be32 *ptr = (const __be32 *)(th + 1); 3875 __be32 *ptr = (__be32 *)(th + 1);
3868 3876
3869 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) 3877 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3870 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { 3878 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
@@ -3873,41 +3881,40 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
3873 tp->rx_opt.rcv_tsval = ntohl(*ptr); 3881 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3874 ++ptr; 3882 ++ptr;
3875 tp->rx_opt.rcv_tsecr = ntohl(*ptr); 3883 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3876 return true; 3884 return 1;
3877 } 3885 }
3878 return false; 3886 return 0;
3879} 3887}
3880 3888
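tcp_parse_aligned_timestamp() above matches the one layout RFC 1323 recommends on an established connection: NOP, NOP, TIMESTAMP (kind 8, length 10), packed into a single predicted 32-bit word, with doff covering the 20-byte header plus 12 aligned option bytes. A stand-alone check of that constant:

#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TCPOPT_NOP        1
#define TCPOPT_TIMESTAMP  8
#define TCPOLEN_TIMESTAMP 10

int main(void)
{
	uint32_t word = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			      (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);

	assert(ntohl(word) == 0x0101080a);             /* the predicted pattern */
	printf("option word 0x%08x, doff = %d\n",
	       ntohl(word), (20 + 12) / 4);            /* doff = 8 */
	return 0;
}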
3881/* Fast parse options. This hopes to only see timestamps. 3889/* Fast parse options. This hopes to only see timestamps.
3882 * If it is wrong it falls back on tcp_parse_options(). 3890 * If it is wrong it falls back on tcp_parse_options().
3883 */ 3891 */
3884static bool tcp_fast_parse_options(const struct sk_buff *skb, 3892static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
3885 const struct tcphdr *th, 3893 struct tcp_sock *tp, u8 **hvpp)
3886 struct tcp_sock *tp, const u8 **hvpp)
3887{ 3894{
3888 /* In the spirit of fast parsing, compare doff directly to constant 3895 /* In the spirit of fast parsing, compare doff directly to constant
3889 * values. Because equality is used, short doff can be ignored here. 3896 * values. Because equality is used, short doff can be ignored here.
3890 */ 3897 */
3891 if (th->doff == (sizeof(*th) / 4)) { 3898 if (th->doff == (sizeof(*th) / 4)) {
3892 tp->rx_opt.saw_tstamp = 0; 3899 tp->rx_opt.saw_tstamp = 0;
3893 return false; 3900 return 0;
3894 } else if (tp->rx_opt.tstamp_ok && 3901 } else if (tp->rx_opt.tstamp_ok &&
3895 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { 3902 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
3896 if (tcp_parse_aligned_timestamp(tp, th)) 3903 if (tcp_parse_aligned_timestamp(tp, th))
3897 return true; 3904 return 1;
3898 } 3905 }
3899 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); 3906 tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
3900 return true; 3907 return 1;
3901} 3908}
3902 3909
3903#ifdef CONFIG_TCP_MD5SIG 3910#ifdef CONFIG_TCP_MD5SIG
3904/* 3911/*
3905 * Parse MD5 Signature option 3912 * Parse MD5 Signature option
3906 */ 3913 */
3907const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) 3914u8 *tcp_parse_md5sig_option(struct tcphdr *th)
3908{ 3915{
3909 int length = (th->doff << 2) - sizeof(*th); 3916 int length = (th->doff << 2) - sizeof (*th);
3910 const u8 *ptr = (const u8 *)(th + 1); 3917 u8 *ptr = (u8*)(th + 1);
3911 3918
3912 /* If the TCP option is too short, we can short cut */ 3919 /* If the TCP option is too short, we can short cut */
3913 if (length < TCPOLEN_MD5SIG) 3920 if (length < TCPOLEN_MD5SIG)
@@ -3984,8 +3991,8 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3984 3991
3985static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) 3992static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
3986{ 3993{
3987 const struct tcp_sock *tp = tcp_sk(sk); 3994 struct tcp_sock *tp = tcp_sk(sk);
3988 const struct tcphdr *th = tcp_hdr(skb); 3995 struct tcphdr *th = tcp_hdr(skb);
3989 u32 seq = TCP_SKB_CB(skb)->seq; 3996 u32 seq = TCP_SKB_CB(skb)->seq;
3990 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3997 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3991 3998
@@ -4002,7 +4009,7 @@ static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
4002 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); 4009 (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
4003} 4010}
4004 4011
4005static inline bool tcp_paws_discard(const struct sock *sk, 4012static inline int tcp_paws_discard(const struct sock *sk,
4006 const struct sk_buff *skb) 4013 const struct sk_buff *skb)
4007{ 4014{
4008 const struct tcp_sock *tp = tcp_sk(sk); 4015 const struct tcp_sock *tp = tcp_sk(sk);
@@ -4024,14 +4031,14 @@ static inline bool tcp_paws_discard(const struct sock *sk,
4024 * (borrowed from freebsd) 4031 * (borrowed from freebsd)
4025 */ 4032 */
4026 4033
4027static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) 4034static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
4028{ 4035{
4029 return !before(end_seq, tp->rcv_wup) && 4036 return !before(end_seq, tp->rcv_wup) &&
4030 !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); 4037 !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
4031} 4038}
4032 4039
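tcp_sequence() above accepts a segment when its end is not before rcv_wup and its start is not after rcv_nxt plus the receive window, using wrap-safe 32-bit comparisons. A self-contained sketch, with before()/after() reimplemented the usual signed-difference way (an assumption about their definition; they live elsewhere in the tree):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static bool after(uint32_t a, uint32_t b)  { return (int32_t)(b - a) < 0; }

static bool seq_acceptable(uint32_t seq, uint32_t end_seq,
			   uint32_t rcv_wup, uint32_t rcv_nxt, uint32_t win)
{
	return !before(end_seq, rcv_wup) && !after(seq, rcv_nxt + win);
}

int main(void)
{
	uint32_t rcv_nxt = 0xfffffff0u;                /* close to the 2^32 wrap */

	/* In-window data that wraps past zero is still accepted. */
	printf("%d\n", seq_acceptable(rcv_nxt, rcv_nxt + 100,
				      rcv_nxt, rcv_nxt, 65535));        /* 1 */
	/* Data entirely below rcv_wup is rejected. */
	printf("%d\n", seq_acceptable(rcv_nxt - 200000, rcv_nxt - 199000,
				      rcv_nxt, rcv_nxt, 65535));        /* 0 */
	return 0;
}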
4033/* When we get a reset we do this. */ 4040/* When we get a reset we do this. */
4034void tcp_reset(struct sock *sk) 4041static void tcp_reset(struct sock *sk)
4035{ 4042{
4036 /* We want the right error as BSD sees it (and indeed as we do). */ 4043 /* We want the right error as BSD sees it (and indeed as we do). */
4037 switch (sk->sk_state) { 4044 switch (sk->sk_state) {
@@ -4069,7 +4076,7 @@ void tcp_reset(struct sock *sk)
4069 * 4076 *
4070 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. 4077 * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
4071 */ 4078 */
4072static void tcp_fin(struct sock *sk) 4079static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
4073{ 4080{
4074 struct tcp_sock *tp = tcp_sk(sk); 4081 struct tcp_sock *tp = tcp_sk(sk);
4075 4082
@@ -4113,7 +4120,7 @@ static void tcp_fin(struct sock *sk)
4113 /* Only TCP_LISTEN and TCP_CLOSE are left, in these 4120 /* Only TCP_LISTEN and TCP_CLOSE are left, in these
4114 * cases we should never reach this piece of code. 4121 * cases we should never reach this piece of code.
4115 */ 4122 */
4116 pr_err("%s: Impossible, sk->sk_state=%d\n", 4123 printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
4117 __func__, sk->sk_state); 4124 __func__, sk->sk_state);
4118 break; 4125 break;
4119 } 4126 }
@@ -4138,7 +4145,7 @@ static void tcp_fin(struct sock *sk)
4138 } 4145 }
4139} 4146}
4140 4147
4141static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, 4148static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4142 u32 end_seq) 4149 u32 end_seq)
4143{ 4150{
4144 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { 4151 if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
@@ -4146,9 +4153,9 @@ static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4146 sp->start_seq = seq; 4153 sp->start_seq = seq;
4147 if (after(end_seq, sp->end_seq)) 4154 if (after(end_seq, sp->end_seq))
4148 sp->end_seq = end_seq; 4155 sp->end_seq = end_seq;
4149 return true; 4156 return 1;
4150 } 4157 }
4151 return false; 4158 return 0;
4152} 4159}
4153 4160
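tcp_sack_extend() above grows an existing SACK block whenever the new range touches or overlaps it, and otherwise reports no merge. A simplified sketch with plain integer comparisons in place of the kernel's wrap-safe before()/after():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sack_block { uint32_t start_seq, end_seq; };

/* Merge [seq, end_seq) into *sp when the ranges touch or overlap. */
static bool sack_extend(struct sack_block *sp, uint32_t seq, uint32_t end_seq)
{
	if (seq <= sp->end_seq && sp->start_seq <= end_seq) {
		if (seq < sp->start_seq)
			sp->start_seq = seq;
		if (end_seq > sp->end_seq)
			sp->end_seq = end_seq;
		return true;
	}
	return false;
}

int main(void)
{
	struct sack_block sp = { .start_seq = 1000, .end_seq = 2000 };

	printf("%d -> [%u,%u)\n", sack_extend(&sp, 2000, 2500),
	       sp.start_seq, sp.end_seq);              /* 1 -> [1000,2500) */
	printf("%d -> [%u,%u)\n", sack_extend(&sp, 4000, 4500),
	       sp.start_seq, sp.end_seq);              /* 0 -> unchanged   */
	return 0;
}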
4154static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) 4161static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
@@ -4181,7 +4188,7 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
4181 tcp_sack_extend(tp->duplicate_sack, seq, end_seq); 4188 tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
4182} 4189}
4183 4190
4184static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) 4191static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
4185{ 4192{
4186 struct tcp_sock *tp = tcp_sk(sk); 4193 struct tcp_sock *tp = tcp_sk(sk);
4187 4194
@@ -4340,258 +4347,37 @@ static void tcp_ofo_queue(struct sock *sk)
4340 __skb_queue_tail(&sk->sk_receive_queue, skb); 4347 __skb_queue_tail(&sk->sk_receive_queue, skb);
4341 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4348 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4342 if (tcp_hdr(skb)->fin) 4349 if (tcp_hdr(skb)->fin)
4343 tcp_fin(sk); 4350 tcp_fin(skb, sk, tcp_hdr(skb));
4344 } 4351 }
4345} 4352}
4346 4353
4347static bool tcp_prune_ofo_queue(struct sock *sk); 4354static int tcp_prune_ofo_queue(struct sock *sk);
4348static int tcp_prune_queue(struct sock *sk); 4355static int tcp_prune_queue(struct sock *sk);
4349 4356
4350static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, 4357static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
4351 unsigned int size)
4352{ 4358{
4353 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || 4359 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4354 !sk_rmem_schedule(sk, skb, size)) { 4360 !sk_rmem_schedule(sk, size)) {
4355 4361
4356 if (tcp_prune_queue(sk) < 0) 4362 if (tcp_prune_queue(sk) < 0)
4357 return -1; 4363 return -1;
4358 4364
4359 if (!sk_rmem_schedule(sk, skb, size)) { 4365 if (!sk_rmem_schedule(sk, size)) {
4360 if (!tcp_prune_ofo_queue(sk)) 4366 if (!tcp_prune_ofo_queue(sk))
4361 return -1; 4367 return -1;
4362 4368
4363 if (!sk_rmem_schedule(sk, skb, size)) 4369 if (!sk_rmem_schedule(sk, size))
4364 return -1; 4370 return -1;
4365 } 4371 }
4366 } 4372 }
4367 return 0; 4373 return 0;
4368} 4374}
4369 4375
4370/**
4371 * tcp_try_coalesce - try to merge skb to prior one
4372 * @sk: socket
4373 * @to: prior buffer
4374 * @from: buffer to add in queue
4375 * @fragstolen: pointer to boolean
4376 *
4377 * Before queueing skb @from after @to, try to merge them
4378 * to reduce overall memory use and queue lengths, if cost is small.
4379 * Packets in ofo or receive queues can stay a long time.
4380 * Better try to coalesce them right now to avoid future collapses.
4381 * Returns true if caller should free @from instead of queueing it
4382 */
4383static bool tcp_try_coalesce(struct sock *sk,
4384 struct sk_buff *to,
4385 struct sk_buff *from,
4386 bool *fragstolen)
4387{
4388 int delta;
4389
4390 *fragstolen = false;
4391
4392 if (tcp_hdr(from)->fin)
4393 return false;
4394
4395 /* Its possible this segment overlaps with prior segment in queue */
4396 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4397 return false;
4398
4399 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4400 return false;
4401
4402 atomic_add(delta, &sk->sk_rmem_alloc);
4403 sk_mem_charge(sk, delta);
4404 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4405 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4406 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4407 return true;
4408}
4409
4410static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4411{
4412 struct tcp_sock *tp = tcp_sk(sk);
4413 struct sk_buff *skb1;
4414 u32 seq, end_seq;
4415
4416 TCP_ECN_check_ce(tp, skb);
4417
4418 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4419 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
4420 __kfree_skb(skb);
4421 return;
4422 }
4423
4424 /* Disable header prediction. */
4425 tp->pred_flags = 0;
4426 inet_csk_schedule_ack(sk);
4427
4428 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
4429 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4430 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4431
4432 skb1 = skb_peek_tail(&tp->out_of_order_queue);
4433 if (!skb1) {
4434 /* Initial out of order segment, build 1 SACK. */
4435 if (tcp_is_sack(tp)) {
4436 tp->rx_opt.num_sacks = 1;
4437 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
4438 tp->selective_acks[0].end_seq =
4439 TCP_SKB_CB(skb)->end_seq;
4440 }
4441 __skb_queue_head(&tp->out_of_order_queue, skb);
4442 goto end;
4443 }
4444
4445 seq = TCP_SKB_CB(skb)->seq;
4446 end_seq = TCP_SKB_CB(skb)->end_seq;
4447
4448 if (seq == TCP_SKB_CB(skb1)->end_seq) {
4449 bool fragstolen;
4450
4451 if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
4452 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4453 } else {
4454 kfree_skb_partial(skb, fragstolen);
4455 skb = NULL;
4456 }
4457
4458 if (!tp->rx_opt.num_sacks ||
4459 tp->selective_acks[0].end_seq != seq)
4460 goto add_sack;
4461
4462 /* Common case: data arrive in order after hole. */
4463 tp->selective_acks[0].end_seq = end_seq;
4464 goto end;
4465 }
4466
4467 /* Find place to insert this segment. */
4468 while (1) {
4469 if (!after(TCP_SKB_CB(skb1)->seq, seq))
4470 break;
4471 if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
4472 skb1 = NULL;
4473 break;
4474 }
4475 skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
4476 }
4477
4478 /* Do skb overlap to previous one? */
4479 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4480 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4481 /* All the bits are present. Drop. */
4482 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4483 __kfree_skb(skb);
4484 skb = NULL;
4485 tcp_dsack_set(sk, seq, end_seq);
4486 goto add_sack;
4487 }
4488 if (after(seq, TCP_SKB_CB(skb1)->seq)) {
4489 /* Partial overlap. */
4490 tcp_dsack_set(sk, seq,
4491 TCP_SKB_CB(skb1)->end_seq);
4492 } else {
4493 if (skb_queue_is_first(&tp->out_of_order_queue,
4494 skb1))
4495 skb1 = NULL;
4496 else
4497 skb1 = skb_queue_prev(
4498 &tp->out_of_order_queue,
4499 skb1);
4500 }
4501 }
4502 if (!skb1)
4503 __skb_queue_head(&tp->out_of_order_queue, skb);
4504 else
4505 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4506
4507 /* And clean segments covered by new one as whole. */
4508 while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
4509 skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
4510
4511 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4512 break;
4513 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4514 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4515 end_seq);
4516 break;
4517 }
4518 __skb_unlink(skb1, &tp->out_of_order_queue);
4519 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4520 TCP_SKB_CB(skb1)->end_seq);
4521 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
4522 __kfree_skb(skb1);
4523 }
4524
4525add_sack:
4526 if (tcp_is_sack(tp))
4527 tcp_sack_new_ofo_skb(sk, seq, end_seq);
4528end:
4529 if (skb)
4530 skb_set_owner_r(skb, sk);
4531}
4532
4533static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4534 bool *fragstolen)
4535{
4536 int eaten;
4537 struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4538
4539 __skb_pull(skb, hdrlen);
4540 eaten = (tail &&
4541 tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0;
4542 tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4543 if (!eaten) {
4544 __skb_queue_tail(&sk->sk_receive_queue, skb);
4545 skb_set_owner_r(skb, sk);
4546 }
4547 return eaten;
4548}
4549
4550int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4551{
4552 struct sk_buff *skb = NULL;
4553 struct tcphdr *th;
4554 bool fragstolen;
4555
4556 if (size == 0)
4557 return 0;
4558
4559 skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
4560 if (!skb)
4561 goto err;
4562
4563 if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
4564 goto err_free;
4565
4566 th = (struct tcphdr *)skb_put(skb, sizeof(*th));
4567 skb_reset_transport_header(skb);
4568 memset(th, 0, sizeof(*th));
4569
4570 if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
4571 goto err_free;
4572
4573 TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
4574 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4575 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4576
4577 if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
4578 WARN_ON_ONCE(fragstolen); /* should not happen */
4579 __kfree_skb(skb);
4580 }
4581 return size;
4582
4583err_free:
4584 kfree_skb(skb);
4585err:
4586 return -ENOMEM;
4587}
4588
4589static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) 4376static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4590{ 4377{
4591 const struct tcphdr *th = tcp_hdr(skb); 4378 struct tcphdr *th = tcp_hdr(skb);
4592 struct tcp_sock *tp = tcp_sk(sk); 4379 struct tcp_sock *tp = tcp_sk(sk);
4593 int eaten = -1; 4380 int eaten = -1;
4594 bool fragstolen = false;
4595 4381
4596 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) 4382 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
4597 goto drop; 4383 goto drop;
@@ -4633,16 +4419,17 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4633 if (eaten <= 0) { 4419 if (eaten <= 0) {
4634queue_and_out: 4420queue_and_out:
4635 if (eaten < 0 && 4421 if (eaten < 0 &&
4636 tcp_try_rmem_schedule(sk, skb, skb->truesize)) 4422 tcp_try_rmem_schedule(sk, skb->truesize))
4637 goto drop; 4423 goto drop;
4638 4424
4639 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); 4425 skb_set_owner_r(skb, sk);
4426 __skb_queue_tail(&sk->sk_receive_queue, skb);
4640 } 4427 }
4641 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 4428 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4642 if (skb->len) 4429 if (skb->len)
4643 tcp_event_data_recv(sk, skb); 4430 tcp_event_data_recv(sk, skb);
4644 if (th->fin) 4431 if (th->fin)
4645 tcp_fin(sk); 4432 tcp_fin(skb, sk, th);
4646 4433
4647 if (!skb_queue_empty(&tp->out_of_order_queue)) { 4434 if (!skb_queue_empty(&tp->out_of_order_queue)) {
4648 tcp_ofo_queue(sk); 4435 tcp_ofo_queue(sk);
@@ -4660,8 +4447,8 @@ queue_and_out:
4660 tcp_fast_path_check(sk); 4447 tcp_fast_path_check(sk);
4661 4448
4662 if (eaten > 0) 4449 if (eaten > 0)
4663 kfree_skb_partial(skb, fragstolen); 4450 __kfree_skb(skb);
4664 if (!sock_flag(sk, SOCK_DEAD)) 4451 else if (!sock_flag(sk, SOCK_DEAD))
4665 sk->sk_data_ready(sk, 0); 4452 sk->sk_data_ready(sk, 0);
4666 return; 4453 return;
4667 } 4454 }
@@ -4701,7 +4488,105 @@ drop:
4701 goto queue_and_out; 4488 goto queue_and_out;
4702 } 4489 }
4703 4490
4704 tcp_data_queue_ofo(sk, skb); 4491 TCP_ECN_check_ce(tp, skb);
4492
4493 if (tcp_try_rmem_schedule(sk, skb->truesize))
4494 goto drop;
4495
4496 /* Disable header prediction. */
4497 tp->pred_flags = 0;
4498 inet_csk_schedule_ack(sk);
4499
4500 SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
4501 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4502
4503 skb_set_owner_r(skb, sk);
4504
4505 if (!skb_peek(&tp->out_of_order_queue)) {
4506 /* Initial out of order segment, build 1 SACK. */
4507 if (tcp_is_sack(tp)) {
4508 tp->rx_opt.num_sacks = 1;
4509 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
4510 tp->selective_acks[0].end_seq =
4511 TCP_SKB_CB(skb)->end_seq;
4512 }
4513 __skb_queue_head(&tp->out_of_order_queue, skb);
4514 } else {
4515 struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue);
4516 u32 seq = TCP_SKB_CB(skb)->seq;
4517 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4518
4519 if (seq == TCP_SKB_CB(skb1)->end_seq) {
4520 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4521
4522 if (!tp->rx_opt.num_sacks ||
4523 tp->selective_acks[0].end_seq != seq)
4524 goto add_sack;
4525
4526 /* Common case: data arrive in order after hole. */
4527 tp->selective_acks[0].end_seq = end_seq;
4528 return;
4529 }
4530
4531 /* Find place to insert this segment. */
4532 while (1) {
4533 if (!after(TCP_SKB_CB(skb1)->seq, seq))
4534 break;
4535 if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
4536 skb1 = NULL;
4537 break;
4538 }
4539 skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
4540 }
4541
4542 /* Do skb overlap to previous one? */
4543 if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
4544 if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4545 /* All the bits are present. Drop. */
4546 __kfree_skb(skb);
4547 tcp_dsack_set(sk, seq, end_seq);
4548 goto add_sack;
4549 }
4550 if (after(seq, TCP_SKB_CB(skb1)->seq)) {
4551 /* Partial overlap. */
4552 tcp_dsack_set(sk, seq,
4553 TCP_SKB_CB(skb1)->end_seq);
4554 } else {
4555 if (skb_queue_is_first(&tp->out_of_order_queue,
4556 skb1))
4557 skb1 = NULL;
4558 else
4559 skb1 = skb_queue_prev(
4560 &tp->out_of_order_queue,
4561 skb1);
4562 }
4563 }
4564 if (!skb1)
4565 __skb_queue_head(&tp->out_of_order_queue, skb);
4566 else
4567 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4568
4569 /* And clean segments covered by new one as whole. */
4570 while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
4571 skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
4572
4573 if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
4574 break;
4575 if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4576 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4577 end_seq);
4578 break;
4579 }
4580 __skb_unlink(skb1, &tp->out_of_order_queue);
4581 tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
4582 TCP_SKB_CB(skb1)->end_seq);
4583 __kfree_skb(skb1);
4584 }
4585
4586add_sack:
4587 if (tcp_is_sack(tp))
4588 tcp_sack_new_ofo_skb(sk, seq, end_seq);
4589 }
4705} 4590}
4706 4591
4707static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, 4592static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4880,10 +4765,10 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
4880 * Purge the out-of-order queue. 4765 * Purge the out-of-order queue.
4881 * Return true if queue was pruned. 4766 * Return true if queue was pruned.
4882 */ 4767 */
4883static bool tcp_prune_ofo_queue(struct sock *sk) 4768static int tcp_prune_ofo_queue(struct sock *sk)
4884{ 4769{
4885 struct tcp_sock *tp = tcp_sk(sk); 4770 struct tcp_sock *tp = tcp_sk(sk);
4886 bool res = false; 4771 int res = 0;
4887 4772
4888 if (!skb_queue_empty(&tp->out_of_order_queue)) { 4773 if (!skb_queue_empty(&tp->out_of_order_queue)) {
4889 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); 4774 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
@@ -4897,7 +4782,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
4897 if (tp->rx_opt.sack_ok) 4782 if (tp->rx_opt.sack_ok)
4898 tcp_sack_reset(&tp->rx_opt); 4783 tcp_sack_reset(&tp->rx_opt);
4899 sk_mem_reclaim(sk); 4784 sk_mem_reclaim(sk);
4900 res = true; 4785 res = 1;
4901 } 4786 }
4902 return res; 4787 return res;
4903} 4788}
@@ -4919,7 +4804,7 @@ static int tcp_prune_queue(struct sock *sk)
4919 4804
4920 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) 4805 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
4921 tcp_clamp_window(sk); 4806 tcp_clamp_window(sk);
4922 else if (sk_under_memory_pressure(sk)) 4807 else if (tcp_memory_pressure)
4923 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); 4808 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
4924 4809
4925 tcp_collapse_ofo_queue(sk); 4810 tcp_collapse_ofo_queue(sk);
@@ -4974,29 +4859,29 @@ void tcp_cwnd_application_limited(struct sock *sk)
4974 tp->snd_cwnd_stamp = tcp_time_stamp; 4859 tp->snd_cwnd_stamp = tcp_time_stamp;
4975} 4860}
4976 4861
4977static bool tcp_should_expand_sndbuf(const struct sock *sk) 4862static int tcp_should_expand_sndbuf(struct sock *sk)
4978{ 4863{
4979 const struct tcp_sock *tp = tcp_sk(sk); 4864 struct tcp_sock *tp = tcp_sk(sk);
4980 4865
4981 /* If the user specified a specific send buffer setting, do 4866 /* If the user specified a specific send buffer setting, do
4982 * not modify it. 4867 * not modify it.
4983 */ 4868 */
4984 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) 4869 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
4985 return false; 4870 return 0;
4986 4871
4987 /* If we are under global TCP memory pressure, do not expand. */ 4872 /* If we are under global TCP memory pressure, do not expand. */
4988 if (sk_under_memory_pressure(sk)) 4873 if (tcp_memory_pressure)
4989 return false; 4874 return 0;
4990 4875
4991 /* If we are under soft global TCP memory pressure, do not expand. */ 4876 /* If we are under soft global TCP memory pressure, do not expand. */
4992 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) 4877 if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
4993 return false; 4878 return 0;
4994 4879
4995 /* If we filled the congestion window, do not expand. */ 4880 /* If we filled the congestion window, do not expand. */
4996 if (tp->packets_out >= tp->snd_cwnd) 4881 if (tp->packets_out >= tp->snd_cwnd)
4997 return false; 4882 return 0;
4998 4883
4999 return true; 4884 return 1;
5000} 4885}
5001 4886
5002/* When incoming ACK allowed to free some skb from write_queue, 4887/* When incoming ACK allowed to free some skb from write_queue,
@@ -5010,10 +4895,8 @@ static void tcp_new_space(struct sock *sk)
5010 struct tcp_sock *tp = tcp_sk(sk); 4895 struct tcp_sock *tp = tcp_sk(sk);
5011 4896
5012 if (tcp_should_expand_sndbuf(sk)) { 4897 if (tcp_should_expand_sndbuf(sk)) {
5013 int sndmem = SKB_TRUESIZE(max_t(u32, 4898 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
5014 tp->rx_opt.mss_clamp, 4899 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
5015 tp->mss_cache) +
5016 MAX_TCP_HEADER);
5017 int demanded = max_t(unsigned int, tp->snd_cwnd, 4900 int demanded = max_t(unsigned int, tp->snd_cwnd,
5018 tp->reordering + 1); 4901 tp->reordering + 1);
5019 sndmem *= 2 * demanded; 4902 sndmem *= 2 * demanded;
@@ -5085,7 +4968,7 @@ static inline void tcp_ack_snd_check(struct sock *sk)
5085 * either form (or just set the sysctl tcp_stdurg). 4968 * either form (or just set the sysctl tcp_stdurg).
5086 */ 4969 */
5087 4970
5088static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) 4971static void tcp_check_urg(struct sock *sk, struct tcphdr *th)
5089{ 4972{
5090 struct tcp_sock *tp = tcp_sk(sk); 4973 struct tcp_sock *tp = tcp_sk(sk);
5091 u32 ptr = ntohs(th->urg_ptr); 4974 u32 ptr = ntohs(th->urg_ptr);
@@ -5151,7 +5034,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
5151} 5034}
5152 5035
5153/* This is the 'fast' part of urgent handling. */ 5036/* This is the 'fast' part of urgent handling. */
5154static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) 5037static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
5155{ 5038{
5156 struct tcp_sock *tp = tcp_sk(sk); 5039 struct tcp_sock *tp = tcp_sk(sk);
5157 5040
@@ -5214,7 +5097,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk,
5214 return result; 5097 return result;
5215} 5098}
5216 5099
5217static inline bool tcp_checksum_complete_user(struct sock *sk, 5100static inline int tcp_checksum_complete_user(struct sock *sk,
5218 struct sk_buff *skb) 5101 struct sk_buff *skb)
5219{ 5102{
5220 return !skb_csum_unnecessary(skb) && 5103 return !skb_csum_unnecessary(skb) &&
@@ -5222,19 +5105,19 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
5222} 5105}
5223 5106
5224#ifdef CONFIG_NET_DMA 5107#ifdef CONFIG_NET_DMA
5225static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, 5108static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
5226 int hlen) 5109 int hlen)
5227{ 5110{
5228 struct tcp_sock *tp = tcp_sk(sk); 5111 struct tcp_sock *tp = tcp_sk(sk);
5229 int chunk = skb->len - hlen; 5112 int chunk = skb->len - hlen;
5230 int dma_cookie; 5113 int dma_cookie;
5231 bool copied_early = false; 5114 int copied_early = 0;
5232 5115
5233 if (tp->ucopy.wakeup) 5116 if (tp->ucopy.wakeup)
5234 return false; 5117 return 0;
5235 5118
5236 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 5119 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
5237 tp->ucopy.dma_chan = net_dma_find_channel(); 5120 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
5238 5121
5239 if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { 5122 if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
5240 5123
@@ -5247,7 +5130,7 @@ static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
5247 goto out; 5130 goto out;
5248 5131
5249 tp->ucopy.dma_cookie = dma_cookie; 5132 tp->ucopy.dma_cookie = dma_cookie;
5250 copied_early = true; 5133 copied_early = 1;
5251 5134
5252 tp->ucopy.len -= chunk; 5135 tp->ucopy.len -= chunk;
5253 tp->copied_seq += chunk; 5136 tp->copied_seq += chunk;
@@ -5271,10 +5154,10 @@ out:
5271/* Does PAWS and seqno based validation of an incoming segment, flags will 5154/* Does PAWS and seqno based validation of an incoming segment, flags will
5272 * play significant role here. 5155 * play significant role here.
5273 */ 5156 */
5274static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 5157static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5275 const struct tcphdr *th, int syn_inerr) 5158 struct tcphdr *th, int syn_inerr)
5276{ 5159{
5277 const u8 *hash_location; 5160 u8 *hash_location;
5278 struct tcp_sock *tp = tcp_sk(sk); 5161 struct tcp_sock *tp = tcp_sk(sk);
5279 5162
5280 /* RFC1323: H1. Apply PAWS check first. */ 5163 /* RFC1323: H1. Apply PAWS check first. */
@@ -5297,48 +5180,38 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
5297 * an acknowledgment should be sent in reply (unless the RST 5180 * an acknowledgment should be sent in reply (unless the RST
5298 * bit is set, if so drop the segment and return)". 5181 * bit is set, if so drop the segment and return)".
5299 */ 5182 */
5300 if (!th->rst) { 5183 if (!th->rst)
5301 if (th->syn)
5302 goto syn_challenge;
5303 tcp_send_dupack(sk, skb); 5184 tcp_send_dupack(sk, skb);
5304 }
5305 goto discard; 5185 goto discard;
5306 } 5186 }
5307 5187
5308 /* Step 2: check RST bit */ 5188 /* Step 2: check RST bit */
5309 if (th->rst) { 5189 if (th->rst) {
5310 /* RFC 5961 3.2 : 5190 tcp_reset(sk);
5311 * If sequence number exactly matches RCV.NXT, then
5312 * RESET the connection
5313 * else
5314 * Send a challenge ACK
5315 */
5316 if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
5317 tcp_reset(sk);
5318 else
5319 tcp_send_challenge_ack(sk);
5320 goto discard; 5191 goto discard;
5321 } 5192 }
5322 5193
5194 /* ts_recent update must be made after we are sure that the packet
5195 * is in window.
5196 */
5197 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
5198
5323 /* step 3: check security and precedence [ignored] */ 5199 /* step 3: check security and precedence [ignored] */
5324 5200
5325 /* step 4: Check for a SYN 5201 /* step 4: Check for a SYN in window. */
5326 * RFC 5691 4.2 : Send a challenge ack 5202 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
5327 */
5328 if (th->syn) {
5329syn_challenge:
5330 if (syn_inerr) 5203 if (syn_inerr)
5331 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 5204 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5332 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); 5205 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
5333 tcp_send_challenge_ack(sk); 5206 tcp_reset(sk);
5334 goto discard; 5207 return -1;
5335 } 5208 }
5336 5209
5337 return true; 5210 return 1;
5338 5211
5339discard: 5212discard:
5340 __kfree_skb(skb); 5213 __kfree_skb(skb);
5341 return false; 5214 return 0;
5342} 5215}
5343 5216
5344/* 5217/*
@@ -5365,12 +5238,11 @@ discard:
5365 * tcp_data_queue when everything is OK. 5238 * tcp_data_queue when everything is OK.
5366 */ 5239 */
5367int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, 5240int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5368 const struct tcphdr *th, unsigned int len) 5241 struct tcphdr *th, unsigned len)
5369{ 5242{
5370 struct tcp_sock *tp = tcp_sk(sk); 5243 struct tcp_sock *tp = tcp_sk(sk);
5244 int res;
5371 5245
5372 if (unlikely(sk->sk_rx_dst == NULL))
5373 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
5374 /* 5246 /*
5375 * Header prediction. 5247 * Header prediction.
5376 * The code loosely follows the one in the famous 5248 * The code loosely follows the one in the famous
@@ -5450,14 +5322,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5450 } else { 5322 } else {
5451 int eaten = 0; 5323 int eaten = 0;
5452 int copied_early = 0; 5324 int copied_early = 0;
5453 bool fragstolen = false;
5454 5325
5455 if (tp->copied_seq == tp->rcv_nxt && 5326 if (tp->copied_seq == tp->rcv_nxt &&
5456 len - tcp_header_len <= tp->ucopy.len) { 5327 len - tcp_header_len <= tp->ucopy.len) {
5457#ifdef CONFIG_NET_DMA 5328#ifdef CONFIG_NET_DMA
5458 if (tp->ucopy.task == current && 5329 if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
5459 sock_owned_by_user(sk) &&
5460 tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
5461 copied_early = 1; 5330 copied_early = 1;
5462 eaten = 1; 5331 eaten = 1;
5463 } 5332 }
@@ -5510,8 +5379,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5510 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); 5379 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
5511 5380
5512 /* Bulk data transfer: receiver */ 5381 /* Bulk data transfer: receiver */
5513 eaten = tcp_queue_rcv(sk, skb, tcp_header_len, 5382 __skb_pull(skb, tcp_header_len);
5514 &fragstolen); 5383 __skb_queue_tail(&sk->sk_receive_queue, skb);
5384 skb_set_owner_r(skb, sk);
5385 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
5515 } 5386 }
5516 5387
5517 tcp_event_data_recv(sk, skb); 5388 tcp_event_data_recv(sk, skb);
@@ -5533,8 +5404,9 @@ no_ack:
5533 else 5404 else
5534#endif 5405#endif
5535 if (eaten) 5406 if (eaten)
5536 kfree_skb_partial(skb, fragstolen); 5407 __kfree_skb(skb);
5537 sk->sk_data_ready(sk, 0); 5408 else
5409 sk->sk_data_ready(sk, 0);
5538 return 0; 5410 return 0;
5539 } 5411 }
5540 } 5412 }
@@ -5543,25 +5415,18 @@ slow_path:
5543 if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) 5415 if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
5544 goto csum_error; 5416 goto csum_error;
5545 5417
5546 if (!th->ack && !th->rst)
5547 goto discard;
5548
5549 /* 5418 /*
5550 * Standard slow path. 5419 * Standard slow path.
5551 */ 5420 */
5552 5421
5553 if (!tcp_validate_incoming(sk, skb, th, 1)) 5422 res = tcp_validate_incoming(sk, skb, th, 1);
5554 return 0; 5423 if (res <= 0)
5424 return -res;
5555 5425
5556step5: 5426step5:
5557 if (tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) 5427 if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
5558 goto discard; 5428 goto discard;
5559 5429
5560 /* ts_recent update must be made after we are sure that the packet
5561 * is in window.
5562 */
5563 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
5564
5565 tcp_rcv_rtt_measure_ts(sk, skb); 5430 tcp_rcv_rtt_measure_ts(sk, skb);
5566 5431
5567 /* Process urgent data. */ 5432 /* Process urgent data. */
@@ -5583,101 +5448,16 @@ discard:
5583} 5448}
5584EXPORT_SYMBOL(tcp_rcv_established); 5449EXPORT_SYMBOL(tcp_rcv_established);
5585 5450
5586void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5587{
5588 struct tcp_sock *tp = tcp_sk(sk);
5589 struct inet_connection_sock *icsk = inet_csk(sk);
5590
5591 tcp_set_state(sk, TCP_ESTABLISHED);
5592
5593 if (skb != NULL) {
5594 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
5595 security_inet_conn_established(sk, skb);
5596 }
5597
5598 /* Make sure socket is routed, for correct metrics. */
5599 icsk->icsk_af_ops->rebuild_header(sk);
5600
5601 tcp_init_metrics(sk);
5602
5603 tcp_init_congestion_control(sk);
5604
5605 /* Prevent spurious tcp_cwnd_restart() on first data
5606 * packet.
5607 */
5608 tp->lsndtime = tcp_time_stamp;
5609
5610 tcp_init_buffer_space(sk);
5611
5612 if (sock_flag(sk, SOCK_KEEPOPEN))
5613 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
5614
5615 if (!tp->rx_opt.snd_wscale)
5616 __tcp_fast_path_on(tp, tp->snd_wnd);
5617 else
5618 tp->pred_flags = 0;
5619
5620 if (!sock_flag(sk, SOCK_DEAD)) {
5621 sk->sk_state_change(sk);
5622 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5623 }
5624}
5625
5626static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
5627 struct tcp_fastopen_cookie *cookie)
5628{
5629 struct tcp_sock *tp = tcp_sk(sk);
5630 struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
5631 u16 mss = tp->rx_opt.mss_clamp;
5632 bool syn_drop;
5633
5634 if (mss == tp->rx_opt.user_mss) {
5635 struct tcp_options_received opt;
5636 const u8 *hash_location;
5637
5638 /* Get original SYNACK MSS value if user MSS sets mss_clamp */
5639 tcp_clear_options(&opt);
5640 opt.user_mss = opt.mss_clamp = 0;
5641 tcp_parse_options(synack, &opt, &hash_location, 0, NULL);
5642 mss = opt.mss_clamp;
5643 }
5644
5645 if (!tp->syn_fastopen) /* Ignore an unsolicited cookie */
5646 cookie->len = -1;
5647
5648 /* The SYN-ACK neither has cookie nor acknowledges the data. Presumably
5649 * the remote receives only the retransmitted (regular) SYNs: either
5650 * the original SYN-data or the corresponding SYN-ACK is lost.
5651 */
5652 syn_drop = (cookie->len <= 0 && data &&
5653 inet_csk(sk)->icsk_retransmits);
5654
5655 tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
5656
5657 if (data) { /* Retransmit unacked data in SYN */
5658 tcp_for_write_queue_from(data, sk) {
5659 if (data == tcp_send_head(sk) ||
5660 __tcp_retransmit_skb(sk, data))
5661 break;
5662 }
5663 tcp_rearm_rto(sk);
5664 return true;
5665 }
5666 tp->syn_data_acked = tp->syn_data;
5667 return false;
5668}
5669
5670static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5451static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5671 const struct tcphdr *th, unsigned int len) 5452 struct tcphdr *th, unsigned len)
5672{ 5453{
5673 const u8 *hash_location; 5454 u8 *hash_location;
5674 struct inet_connection_sock *icsk = inet_csk(sk); 5455 struct inet_connection_sock *icsk = inet_csk(sk);
5675 struct tcp_sock *tp = tcp_sk(sk); 5456 struct tcp_sock *tp = tcp_sk(sk);
5676 struct tcp_cookie_values *cvp = tp->cookie_values; 5457 struct tcp_cookie_values *cvp = tp->cookie_values;
5677 struct tcp_fastopen_cookie foc = { .len = -1 };
5678 int saved_clamp = tp->rx_opt.mss_clamp; 5458 int saved_clamp = tp->rx_opt.mss_clamp;
5679 5459
5680 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); 5460 tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
5681 5461
5682 if (th->ack) { 5462 if (th->ack) {
5683 /* rfc793: 5463 /* rfc793:
@@ -5687,9 +5467,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5687 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send 5467 * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
5688 * a reset (unless the RST bit is set, if so drop 5468 * a reset (unless the RST bit is set, if so drop
5689 * the segment and return)" 5469 * the segment and return)"
5470 *
5471 * We do not send data with SYN, so that RFC-correct
5472 * test reduces to:
5690 */ 5473 */
5691 if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || 5474 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
5692 after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
5693 goto reset_and_undo; 5475 goto reset_and_undo;
5694 5476
5695 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 5477 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -5731,7 +5513,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5731 5513
5732 TCP_ECN_rcv_synack(tp, th); 5514 TCP_ECN_rcv_synack(tp, th);
5733 5515
5734 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 5516 tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
5735 tcp_ack(sk, skb, FLAG_SLOWPATH); 5517 tcp_ack(sk, skb, FLAG_SLOWPATH);
5736 5518
5737 /* Ok.. it's good. Set up sequence numbers and 5519 /* Ok.. it's good. Set up sequence numbers and
@@ -5744,6 +5526,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5744 * never scaled. 5526 * never scaled.
5745 */ 5527 */
5746 tp->snd_wnd = ntohs(th->window); 5528 tp->snd_wnd = ntohs(th->window);
5529 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5747 5530
5748 if (!tp->rx_opt.wscale_ok) { 5531 if (!tp->rx_opt.wscale_ok) {
5749 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 5532 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5797,12 +5580,36 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5797 } 5580 }
5798 5581
5799 smp_mb(); 5582 smp_mb();
5583 tcp_set_state(sk, TCP_ESTABLISHED);
5800 5584
5801 tcp_finish_connect(sk, skb); 5585 security_inet_conn_established(sk, skb);
5802 5586
5803 if ((tp->syn_fastopen || tp->syn_data) && 5587 /* Make sure socket is routed, for correct metrics. */
5804 tcp_rcv_fastopen_synack(sk, skb, &foc)) 5588 icsk->icsk_af_ops->rebuild_header(sk);
5805 return -1; 5589
5590 tcp_init_metrics(sk);
5591
5592 tcp_init_congestion_control(sk);
5593
5594 /* Prevent spurious tcp_cwnd_restart() on first data
5595 * packet.
5596 */
5597 tp->lsndtime = tcp_time_stamp;
5598
5599 tcp_init_buffer_space(sk);
5600
5601 if (sock_flag(sk, SOCK_KEEPOPEN))
5602 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
5603
5604 if (!tp->rx_opt.snd_wscale)
5605 __tcp_fast_path_on(tp, tp->snd_wnd);
5606 else
5607 tp->pred_flags = 0;
5608
5609 if (!sock_flag(sk, SOCK_DEAD)) {
5610 sk->sk_state_change(sk);
5611 sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
5612 }
5806 5613
5807 if (sk->sk_write_pending || 5614 if (sk->sk_write_pending ||
5808 icsk->icsk_accept_queue.rskq_defer_accept || 5615 icsk->icsk_accept_queue.rskq_defer_accept ||
@@ -5816,6 +5623,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5816 */ 5623 */
5817 inet_csk_schedule_ack(sk); 5624 inet_csk_schedule_ack(sk);
5818 icsk->icsk_ack.lrcvtime = tcp_time_stamp; 5625 icsk->icsk_ack.lrcvtime = tcp_time_stamp;
5626 icsk->icsk_ack.ato = TCP_ATO_MIN;
5627 tcp_incr_quickack(sk);
5819 tcp_enter_quickack_mode(sk); 5628 tcp_enter_quickack_mode(sk);
5820 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 5629 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5821 TCP_DELACK_MAX, TCP_RTO_MAX); 5630 TCP_DELACK_MAX, TCP_RTO_MAX);
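Both sides of this hunk end the same way: if the application already has data queued (or defer-accept is in effect), the final ACK of the handshake is delayed so it can ride on the first data segment, and the socket enters quick-ack mode with a capped delayed-ACK timer. A rough user-space model of that choice follows; the names and constants are illustrative, not the kernel's.

#include <stdbool.h>

enum ack_plan { ACK_NOW, ACK_DELAYED };

struct handshake_done {
    bool write_pending;   /* application data already queued to send */
    bool defer_accept;    /* listener asked to defer the ACK         */
    int  delack_max_ms;   /* upper bound on the delayed ACK          */
};

enum ack_plan plan_first_ack(const struct handshake_done *h, int *timeout_ms)
{
    if (h->write_pending || h->defer_accept) {
        *timeout_ms = h->delack_max_ms; /* piggyback on the first data segment */
        return ACK_DELAYED;
    }
    *timeout_ms = 0;
    return ACK_NOW;
}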
@@ -5881,9 +5690,7 @@ discard:
5881 tcp_send_synack(sk); 5690 tcp_send_synack(sk);
5882#if 0 5691#if 0
5883 /* Note, we could accept data and URG from this segment. 5692 /* Note, we could accept data and URG from this segment.
5884 * There are no obstacles to make this (except that we must 5693 * There are no obstacles to make this.
5885 * either change tcp_recvmsg() to prevent it from returning data
5886 * before 3WHS completes per RFC793, or employ TCP Fast Open).
5887 * 5694 *
5888 * However, if we ignore data in ACKless segments sometimes, 5695 * However, if we ignore data in ACKless segments sometimes,
5889 * we have no reasons to accept it sometimes. 5696 * we have no reasons to accept it sometimes.
@@ -5919,12 +5726,12 @@ reset_and_undo:
5919 */ 5726 */
5920 5727
5921int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, 5728int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5922 const struct tcphdr *th, unsigned int len) 5729 struct tcphdr *th, unsigned len)
5923{ 5730{
5924 struct tcp_sock *tp = tcp_sk(sk); 5731 struct tcp_sock *tp = tcp_sk(sk);
5925 struct inet_connection_sock *icsk = inet_csk(sk); 5732 struct inet_connection_sock *icsk = inet_csk(sk);
5926 struct request_sock *req;
5927 int queued = 0; 5733 int queued = 0;
5734 int res;
5928 5735
5929 tp->rx_opt.saw_tstamp = 0; 5736 tp->rx_opt.saw_tstamp = 0;
5930 5737
@@ -5940,8 +5747,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5940 goto discard; 5747 goto discard;
5941 5748
5942 if (th->syn) { 5749 if (th->syn) {
5943 if (th->fin)
5944 goto discard;
5945 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) 5750 if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
5946 return 1; 5751 return 1;
5947 5752
@@ -5979,47 +5784,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5979 return 0; 5784 return 0;
5980 } 5785 }
5981 5786
5982 req = tp->fastopen_rsk; 5787 res = tcp_validate_incoming(sk, skb, th, 0);
5983 if (req != NULL) { 5788 if (res <= 0)
5984 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && 5789 return -res;
5985 sk->sk_state != TCP_FIN_WAIT1);
5986
5987 if (tcp_check_req(sk, skb, req, NULL, true) == NULL)
5988 goto discard;
5989 }
5990
5991 if (!th->ack && !th->rst)
5992 goto discard;
5993
5994 if (!tcp_validate_incoming(sk, skb, th, 0))
5995 return 0;
5996 5790
5997 /* step 5: check the ACK field */ 5791 /* step 5: check the ACK field */
5998 if (true) { 5792 if (th->ack) {
5999 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; 5793 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
6000 5794
6001 switch (sk->sk_state) { 5795 switch (sk->sk_state) {
6002 case TCP_SYN_RECV: 5796 case TCP_SYN_RECV:
6003 if (acceptable) { 5797 if (acceptable) {
6004 /* Once we leave TCP_SYN_RECV, we no longer 5798 tp->copied_seq = tp->rcv_nxt;
6005 * need req so release it.
6006 */
6007 if (req) {
6008 tcp_synack_rtt_meas(sk, req);
6009 tp->total_retrans = req->num_retrans;
6010
6011 reqsk_fastopen_remove(sk, req, false);
6012 } else {
6013 /* Make sure socket is routed, for
6014 * correct metrics.
6015 */
6016 icsk->icsk_af_ops->rebuild_header(sk);
6017 tcp_init_congestion_control(sk);
6018
6019 tcp_mtup_init(sk);
6020 tcp_init_buffer_space(sk);
6021 tp->copied_seq = tp->rcv_nxt;
6022 }
6023 smp_mb(); 5799 smp_mb();
6024 tcp_set_state(sk, TCP_ESTABLISHED); 5800 tcp_set_state(sk, TCP_ESTABLISHED);
6025 sk->sk_state_change(sk); 5801 sk->sk_state_change(sk);
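As a paraphrase of the step-5 logic above (again a sketch, not the kernel): an ACK arriving in SYN-RECV that passes the acceptability test moves the connection to ESTABLISHED, while an unacceptable one makes the function return 1, which the caller turns into a reset.

enum tcp_state { SYN_RECV, ESTABLISHED };

/* Returns 0 on success, 1 to ask the caller to send a reset. */
int handle_ack_in_syn_recv(enum tcp_state *state, int ack_acceptable)
{
    if (*state != SYN_RECV)
        return 0;
    if (!ack_acceptable)
        return 1;            /* caller resets the connection */
    *state = ESTABLISHED;    /* handshake complete           */
    return 0;
}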
@@ -6041,27 +5817,23 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6041 if (tp->rx_opt.tstamp_ok) 5817 if (tp->rx_opt.tstamp_ok)
6042 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 5818 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
6043 5819
6044 if (req) { 5820 /* Make sure socket is routed, for
6045 /* Re-arm the timer because data may 5821 * correct metrics.
6046 * have been sent out. This is similar 5822 */
6047 * to the regular data transmission case 5823 icsk->icsk_af_ops->rebuild_header(sk);
6048 * when new data has just been ack'ed. 5824
6049 * 5825 tcp_init_metrics(sk);
6050 * (TFO) - we could try to be more 5826
6051 * aggressive and retranmitting any data 5827 tcp_init_congestion_control(sk);
6052 * sooner based on when they were sent
6053 * out.
6054 */
6055 tcp_rearm_rto(sk);
6056 } else
6057 tcp_init_metrics(sk);
6058 5828
6059 /* Prevent spurious tcp_cwnd_restart() on 5829 /* Prevent spurious tcp_cwnd_restart() on
6060 * first data packet. 5830 * first data packet.
6061 */ 5831 */
6062 tp->lsndtime = tcp_time_stamp; 5832 tp->lsndtime = tcp_time_stamp;
6063 5833
5834 tcp_mtup_init(sk);
6064 tcp_initialize_rcv_mss(sk); 5835 tcp_initialize_rcv_mss(sk);
5836 tcp_init_buffer_space(sk);
6065 tcp_fast_path_on(tp); 5837 tcp_fast_path_on(tp);
6066 } else { 5838 } else {
6067 return 1; 5839 return 1;
@@ -6069,33 +5841,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6069 break; 5841 break;
6070 5842
6071 case TCP_FIN_WAIT1: 5843 case TCP_FIN_WAIT1:
6072 /* If we enter the TCP_FIN_WAIT1 state and we are a
6073 * Fast Open socket and this is the first acceptable
6074 * ACK we have received, this would have acknowledged
6075 * our SYNACK so stop the SYNACK timer.
6076 */
6077 if (req != NULL) {
6078 /* Return RST if ack_seq is invalid.
6079 * Note that RFC793 only says to generate a
6080 * DUPACK for it but for TCP Fast Open it seems
6081 * better to treat this case like TCP_SYN_RECV
6082 * above.
6083 */
6084 if (!acceptable)
6085 return 1;
6086 /* We no longer need the request sock. */
6087 reqsk_fastopen_remove(sk, req, false);
6088 tcp_rearm_rto(sk);
6089 }
6090 if (tp->snd_una == tp->write_seq) { 5844 if (tp->snd_una == tp->write_seq) {
6091 struct dst_entry *dst;
6092
6093 tcp_set_state(sk, TCP_FIN_WAIT2); 5845 tcp_set_state(sk, TCP_FIN_WAIT2);
6094 sk->sk_shutdown |= SEND_SHUTDOWN; 5846 sk->sk_shutdown |= SEND_SHUTDOWN;
6095 5847 dst_confirm(__sk_dst_get(sk));
6096 dst = __sk_dst_get(sk);
6097 if (dst)
6098 dst_confirm(dst);
6099 5848
6100 if (!sock_flag(sk, SOCK_DEAD)) 5849 if (!sock_flag(sk, SOCK_DEAD))
6101 /* Wake up lingering close() */ 5850 /* Wake up lingering close() */
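In the FIN_WAIT1 branch above, the transition to FIN_WAIT2 happens only once everything we sent, including our FIN, has been acknowledged, i.e. SND.UNA has caught up with the write sequence. A compact illustration with hypothetical types:

#include <stdbool.h>
#include <stdint.h>

enum fin_state { FIN_WAIT1, FIN_WAIT2 };

bool maybe_enter_fin_wait2(enum fin_state *st, uint32_t snd_una, uint32_t write_seq)
{
    if (*st == FIN_WAIT1 && snd_una == write_seq) {
        *st = FIN_WAIT2;   /* our FIN is acked; now wait for the peer's FIN */
        return true;
    }
    return false;
}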
@@ -6145,12 +5894,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
6145 } 5894 }
6146 break; 5895 break;
6147 } 5896 }
6148 } 5897 } else
6149 5898 goto discard;
6150 /* ts_recent update must be made after we are sure that the packet
6151 * is in window.
6152 */
6153 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
6154 5899
6155 /* step 6: check the URG bit */ 5900 /* step 6: check the URG bit */
6156 tcp_urg(sk, skb, th); 5901 tcp_urg(sk, skb, th);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 54139fa514e..6cdf6a28f6b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -50,7 +50,6 @@
50 * a single port at the same time. 50 * a single port at the same time.
51 */ 51 */
52 52
53#define pr_fmt(fmt) "TCP: " fmt
54 53
55#include <linux/bottom_half.h> 54#include <linux/bottom_half.h>
56#include <linux/types.h> 55#include <linux/types.h>
@@ -74,7 +73,6 @@
74#include <net/xfrm.h> 73#include <net/xfrm.h>
75#include <net/netdma.h> 74#include <net/netdma.h>
76#include <net/secure_seq.h> 75#include <net/secure_seq.h>
77#include <net/tcp_memcontrol.h>
78 76
79#include <linux/inet.h> 77#include <linux/inet.h>
80#include <linux/ipv6.h> 78#include <linux/ipv6.h>
@@ -91,14 +89,22 @@ EXPORT_SYMBOL(sysctl_tcp_low_latency);
91 89
92 90
93#ifdef CONFIG_TCP_MD5SIG 91#ifdef CONFIG_TCP_MD5SIG
94static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 92static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
95 __be32 daddr, __be32 saddr, const struct tcphdr *th); 93 __be32 addr);
94static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
95 __be32 daddr, __be32 saddr, struct tcphdr *th);
96#else
97static inline
98struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
99{
100 return NULL;
101}
96#endif 102#endif
97 103
98struct inet_hashinfo tcp_hashinfo; 104struct inet_hashinfo tcp_hashinfo;
99EXPORT_SYMBOL(tcp_hashinfo); 105EXPORT_SYMBOL(tcp_hashinfo);
100 106
101static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb) 107static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
102{ 108{
103 return secure_tcp_sequence_number(ip_hdr(skb)->daddr, 109 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104 ip_hdr(skb)->saddr, 110 ip_hdr(skb)->saddr,
@@ -196,13 +202,26 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
196 /* Reset inherited state */ 202 /* Reset inherited state */
197 tp->rx_opt.ts_recent = 0; 203 tp->rx_opt.ts_recent = 0;
198 tp->rx_opt.ts_recent_stamp = 0; 204 tp->rx_opt.ts_recent_stamp = 0;
199 if (likely(!tp->repair)) 205 tp->write_seq = 0;
200 tp->write_seq = 0;
201 } 206 }
202 207
203 if (tcp_death_row.sysctl_tw_recycle && 208 if (tcp_death_row.sysctl_tw_recycle &&
204 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) 209 !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
205 tcp_fetch_timewait_stamp(sk, &rt->dst); 210 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
211 /*
212 * VJ's idea. We save last timestamp seen from
213 * the destination in peer table, when entering state
214 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
215 * when trying new connection.
216 */
217 if (peer) {
218 inet_peer_refcheck(peer);
219 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
220 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
221 tp->rx_opt.ts_recent = peer->tcp_ts;
222 }
223 }
224 }
206 225
207 inet->inet_dport = usin->sin_port; 226 inet->inet_dport = usin->sin_port;
208 inet->inet_daddr = daddr; 227 inet->inet_daddr = daddr;
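The block added on the right-hand side implements the comment's "VJ's idea": the last timestamp seen from a destination is remembered in the peer cache when a connection enters TIME-WAIT, and a new connection to the same peer seeds rx_opt.ts_recent from it if the entry is younger than TCP_PAWS_MSL. A tiny user-space model of that seeding decision (the struct and the 60-second constant are stand-ins):

#include <stdint.h>
#include <time.h>

#define PAWS_MSL_SECONDS 60 /* stand-in for TCP_PAWS_MSL */

struct peer_ts_cache {
    uint32_t ts_val;    /* last timestamp value seen from this peer */
    time_t   stamp;     /* when it was recorded (entering TIME-WAIT) */
};

/* Seed ts_recent from the cache if the entry is fresh enough. */
void seed_ts_recent(const struct peer_ts_cache *peer, time_t now,
                    uint32_t *ts_recent, time_t *ts_recent_stamp)
{
    if (peer && now - peer->stamp <= PAWS_MSL_SECONDS) {
        *ts_recent       = peer->ts_val;
        *ts_recent_stamp = peer->stamp;
    }
}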
@@ -234,7 +253,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
234 sk->sk_gso_type = SKB_GSO_TCPV4; 253 sk->sk_gso_type = SKB_GSO_TCPV4;
235 sk_setup_caps(sk, &rt->dst); 254 sk_setup_caps(sk, &rt->dst);
236 255
237 if (!tp->write_seq && likely(!tp->repair)) 256 if (!tp->write_seq)
238 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 257 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
239 inet->inet_daddr, 258 inet->inet_daddr,
240 inet->inet_sport, 259 inet->inet_sport,
@@ -243,7 +262,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
243 inet->inet_id = tp->write_seq ^ jiffies; 262 inet->inet_id = tp->write_seq ^ jiffies;
244 263
245 err = tcp_connect(sk); 264 err = tcp_connect(sk);
246
247 rt = NULL; 265 rt = NULL;
248 if (err) 266 if (err)
249 goto failure; 267 goto failure;
@@ -264,15 +282,12 @@ failure:
264EXPORT_SYMBOL(tcp_v4_connect); 282EXPORT_SYMBOL(tcp_v4_connect);
265 283
266/* 284/*
267 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 285 * This routine does path mtu discovery as defined in RFC1191.
268 * It can be called through tcp_release_cb() if socket was owned by user
269 * at the time tcp_v4_err() was called to handle ICMP message.
270 */ 286 */
271static void tcp_v4_mtu_reduced(struct sock *sk) 287static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
272{ 288{
273 struct dst_entry *dst; 289 struct dst_entry *dst;
274 struct inet_sock *inet = inet_sk(sk); 290 struct inet_sock *inet = inet_sk(sk);
275 u32 mtu = tcp_sk(sk)->mtu_info;
276 291
277 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs 292 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
278 * send out by Linux are always <576bytes so they should go through 293 * send out by Linux are always <576bytes so they should go through
@@ -281,10 +296,17 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
281 if (sk->sk_state == TCP_LISTEN) 296 if (sk->sk_state == TCP_LISTEN)
282 return; 297 return;
283 298
284 dst = inet_csk_update_pmtu(sk, mtu); 299 /* We don't check in the destentry if pmtu discovery is forbidden
285 if (!dst) 300 * on this route. We just assume that no packet-too-big packets
301 * are sent back when pmtu discovery is not active.
302 * There is a small race when the user changes this flag in the
303 * route, but I think that's acceptable.
304 */
305 if ((dst = __sk_dst_check(sk, 0)) == NULL)
286 return; 306 return;
287 307
308 dst->ops->update_pmtu(dst, mtu);
309
288 /* Something is about to be wrong... Remember soft error 310 /* Something is about to be wrong... Remember soft error
289 * for the case, if this connection will not able to recover. 311 * for the case, if this connection will not able to recover.
290 */ 312 */
@@ -306,14 +328,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk)
306 } /* else let the usual retransmit timer handle it */ 328 } /* else let the usual retransmit timer handle it */
307} 329}
308 330
309static void do_redirect(struct sk_buff *skb, struct sock *sk)
310{
311 struct dst_entry *dst = __sk_dst_check(sk, 0);
312
313 if (dst)
314 dst->ops->redirect(dst, sk, skb);
315}
316
317/* 331/*
318 * This routine is called by the ICMP module when it gets some 332 * This routine is called by the ICMP module when it gets some
319 * sort of error condition. If err < 0 then the socket should 333 * sort of error condition. If err < 0 then the socket should
@@ -341,7 +355,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
341 const int code = icmp_hdr(icmp_skb)->code; 355 const int code = icmp_hdr(icmp_skb)->code;
342 struct sock *sk; 356 struct sock *sk;
343 struct sk_buff *skb; 357 struct sk_buff *skb;
344 struct request_sock *req;
345 __u32 seq; 358 __u32 seq;
346 __u32 remaining; 359 __u32 remaining;
347 int err; 360 int err;
@@ -366,12 +379,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
366 bh_lock_sock(sk); 379 bh_lock_sock(sk);
367 /* If too many ICMPs get dropped on busy 380 /* If too many ICMPs get dropped on busy
368 * servers this needs to be solved differently. 381 * servers this needs to be solved differently.
369 * We do take care of PMTU discovery (RFC1191) special case :
370 * we can receive locally generated ICMP messages while socket is held.
371 */ 382 */
372 if (sock_owned_by_user(sk) && 383 if (sock_owned_by_user(sk))
373 type != ICMP_DEST_UNREACH &&
374 code != ICMP_FRAG_NEEDED)
375 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); 384 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
376 385
377 if (sk->sk_state == TCP_CLOSE) 386 if (sk->sk_state == TCP_CLOSE)
@@ -384,20 +393,14 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
384 393
385 icsk = inet_csk(sk); 394 icsk = inet_csk(sk);
386 tp = tcp_sk(sk); 395 tp = tcp_sk(sk);
387 req = tp->fastopen_rsk;
388 seq = ntohl(th->seq); 396 seq = ntohl(th->seq);
389 if (sk->sk_state != TCP_LISTEN && 397 if (sk->sk_state != TCP_LISTEN &&
390 !between(seq, tp->snd_una, tp->snd_nxt) && 398 !between(seq, tp->snd_una, tp->snd_nxt)) {
391 (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
392 /* For a Fast Open socket, allow seq to be snt_isn. */
393 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); 399 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 goto out; 400 goto out;
395 } 401 }
396 402
397 switch (type) { 403 switch (type) {
398 case ICMP_REDIRECT:
399 do_redirect(icmp_skb, sk);
400 goto out;
401 case ICMP_SOURCE_QUENCH: 404 case ICMP_SOURCE_QUENCH:
402 /* Just silently ignore these. */ 405 /* Just silently ignore these. */
403 goto out; 406 goto out;
@@ -409,13 +412,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
409 goto out; 412 goto out;
410 413
411 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 414 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
412 tp->mtu_info = info; 415 if (!sock_owned_by_user(sk))
413 if (!sock_owned_by_user(sk)) { 416 do_pmtu_discovery(sk, iph, info);
414 tcp_v4_mtu_reduced(sk);
415 } else {
416 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
417 sock_hold(sk);
418 }
419 goto out; 417 goto out;
420 } 418 }
421 419
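The left-hand (newer) code in this hunk shows the pattern that replaced do_pmtu_discovery(): record the ICMP-reported MTU in tp->mtu_info and, if the socket is currently owned by the user, only set a "MTU reduced" deferred flag so the work runs when the lock is released. A generic sketch of that defer-to-owner pattern, with made-up names:

#include <stdbool.h>
#include <stdint.h>

struct deferred_sock {
    bool     owned_by_user;   /* user context holds the socket lock */
    uint32_t pending_mtu;     /* last MTU reported by ICMP           */
    unsigned deferred_flags;  /* work to run when the lock is dropped */
};

#define DEFER_MTU_REDUCED 0x1u

void apply_mtu_now(struct deferred_sock *sk) { (void)sk; /* shrink mss, retransmit */ }

void on_icmp_frag_needed(struct deferred_sock *sk, uint32_t mtu)
{
    sk->pending_mtu = mtu;
    if (!sk->owned_by_user)
        apply_mtu_now(sk);                        /* safe: we own the socket */
    else
        sk->deferred_flags |= DEFER_MTU_REDUCED;  /* handled on release      */
}

void on_release_lock(struct deferred_sock *sk)
{
    if (sk->deferred_flags & DEFER_MTU_REDUCED) {
        sk->deferred_flags &= ~DEFER_MTU_REDUCED;
        apply_mtu_now(sk);
    }
}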
@@ -428,8 +426,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
428 !icsk->icsk_backoff) 426 !icsk->icsk_backoff)
429 break; 427 break;
430 428
431 /* XXX (TFO) - revisit the following logic for TFO */
432
433 if (sock_owned_by_user(sk)) 429 if (sock_owned_by_user(sk))
434 break; 430 break;
435 431
@@ -461,14 +457,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
461 goto out; 457 goto out;
462 } 458 }
463 459
464 /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
465 * than following the TCP_SYN_RECV case and closing the socket,
466 * we ignore the ICMP error and keep trying like a fully established
467 * socket. Is this the right thing to do?
468 */
469 if (req && req->sk == NULL)
470 goto out;
471
472 switch (sk->sk_state) { 460 switch (sk->sk_state) {
473 struct request_sock *req, **prev; 461 struct request_sock *req, **prev;
474 case TCP_LISTEN: 462 case TCP_LISTEN:
@@ -501,8 +489,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
501 489
502 case TCP_SYN_SENT: 490 case TCP_SYN_SENT:
503 case TCP_SYN_RECV: /* Cannot happen. 491 case TCP_SYN_RECV: /* Cannot happen.
504 It can f.e. if SYNs crossed, 492 It can f.e. if SYNs crossed.
505 or Fast Open.
506 */ 493 */
507 if (!sock_owned_by_user(sk)) { 494 if (!sock_owned_by_user(sk)) {
508 sk->sk_err = err; 495 sk->sk_err = err;
@@ -565,7 +552,7 @@ static void __tcp_v4_send_check(struct sk_buff *skb,
565/* This routine computes an IPv4 TCP checksum. */ 552/* This routine computes an IPv4 TCP checksum. */
566void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) 553void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
567{ 554{
568 const struct inet_sock *inet = inet_sk(sk); 555 struct inet_sock *inet = inet_sk(sk);
569 556
570 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); 557 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
571} 558}
@@ -603,7 +590,7 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)
603 590
604static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) 591static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
605{ 592{
606 const struct tcphdr *th = tcp_hdr(skb); 593 struct tcphdr *th = tcp_hdr(skb);
607 struct { 594 struct {
608 struct tcphdr th; 595 struct tcphdr th;
609#ifdef CONFIG_TCP_MD5SIG 596#ifdef CONFIG_TCP_MD5SIG
@@ -613,10 +600,6 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
613 struct ip_reply_arg arg; 600 struct ip_reply_arg arg;
614#ifdef CONFIG_TCP_MD5SIG 601#ifdef CONFIG_TCP_MD5SIG
615 struct tcp_md5sig_key *key; 602 struct tcp_md5sig_key *key;
616 const __u8 *hash_location = NULL;
617 unsigned char newhash[16];
618 int genhash;
619 struct sock *sk1 = NULL;
620#endif 603#endif
621 struct net *net; 604 struct net *net;
622 605
@@ -647,36 +630,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
647 arg.iov[0].iov_len = sizeof(rep.th); 630 arg.iov[0].iov_len = sizeof(rep.th);
648 631
649#ifdef CONFIG_TCP_MD5SIG 632#ifdef CONFIG_TCP_MD5SIG
650 hash_location = tcp_parse_md5sig_option(th); 633 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
651 if (!sk && hash_location) {
652 /*
653 * active side is lost. Try to find listening socket through
654 * source port, and then find md5 key through listening socket.
655 * we are not losing security here:
656 * Incoming packet is checked with md5 hash using the found key,
657 * no RST generated if md5 hash doesn't match.
658 */
659 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
660 &tcp_hashinfo, ip_hdr(skb)->daddr,
661 ntohs(th->source), inet_iif(skb));
662 /* don't send rst if it can't find key */
663 if (!sk1)
664 return;
665 rcu_read_lock();
666 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
667 &ip_hdr(skb)->saddr, AF_INET);
668 if (!key)
669 goto release_sk1;
670
671 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
672 if (genhash || memcmp(hash_location, newhash, 16) != 0)
673 goto release_sk1;
674 } else {
675 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
676 &ip_hdr(skb)->saddr,
677 AF_INET) : NULL;
678 }
679
680 if (key) { 634 if (key) {
681 rep.opt[0] = htonl((TCPOPT_NOP << 24) | 635 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
682 (TCPOPT_NOP << 16) | 636 (TCPOPT_NOP << 16) |
@@ -696,28 +650,13 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
696 arg.iov[0].iov_len, IPPROTO_TCP, 0); 650 arg.iov[0].iov_len, IPPROTO_TCP, 0);
697 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 651 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
698 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; 652 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
699 /* When socket is gone, all binding information is lost.
700 * routing might fail in this case. No choice here, if we choose to force
701 * input interface, we will misroute in case of asymmetric route.
702 */
703 if (sk)
704 arg.bound_dev_if = sk->sk_bound_dev_if;
705 653
706 net = dev_net(skb_dst(skb)->dev); 654 net = dev_net(skb_dst(skb)->dev);
707 arg.tos = ip_hdr(skb)->tos; 655 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
708 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, 656 &arg, arg.iov[0].iov_len);
709 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
710 657
711 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 658 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
712 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); 659 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
713
714#ifdef CONFIG_TCP_MD5SIG
715release_sk1:
716 if (sk1) {
717 rcu_read_unlock();
718 sock_put(sk1);
719 }
720#endif
721} 660}
722 661
723/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states 662/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
@@ -727,9 +666,9 @@ release_sk1:
727static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, 666static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
728 u32 win, u32 ts, int oif, 667 u32 win, u32 ts, int oif,
729 struct tcp_md5sig_key *key, 668 struct tcp_md5sig_key *key,
730 int reply_flags, u8 tos) 669 int reply_flags)
731{ 670{
732 const struct tcphdr *th = tcp_hdr(skb); 671 struct tcphdr *th = tcp_hdr(skb);
733 struct { 672 struct {
734 struct tcphdr th; 673 struct tcphdr th;
735 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2) 674 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
@@ -787,9 +726,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
787 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 726 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
788 if (oif) 727 if (oif)
789 arg.bound_dev_if = oif; 728 arg.bound_dev_if = oif;
790 arg.tos = tos; 729
791 ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, 730 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
792 ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); 731 &arg, arg.iov[0].iov_len);
793 732
794 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); 733 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
795} 734}
@@ -804,8 +743,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
804 tcptw->tw_ts_recent, 743 tcptw->tw_ts_recent,
805 tw->tw_bound_dev_if, 744 tw->tw_bound_dev_if,
806 tcp_twsk_md5_key(tcptw), 745 tcp_twsk_md5_key(tcptw),
807 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, 746 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
808 tw->tw_tos
809 ); 747 );
810 748
811 inet_twsk_put(tw); 749 inet_twsk_put(tw);
@@ -814,18 +752,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
814static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, 752static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
815 struct request_sock *req) 753 struct request_sock *req)
816{ 754{
817 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV 755 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
818 * sk->sk_state == TCP_SYN_RECV -> for Fast Open. 756 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
819 */
820 tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
821 tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
822 tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
823 req->ts_recent, 757 req->ts_recent,
824 0, 758 0,
825 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, 759 tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
826 AF_INET), 760 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
827 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
828 ip_hdr(skb)->tos);
829} 761}
830 762
831/* 763/*
@@ -835,9 +767,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
835 */ 767 */
836static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, 768static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
837 struct request_sock *req, 769 struct request_sock *req,
838 struct request_values *rvp, 770 struct request_values *rvp)
839 u16 queue_mapping,
840 bool nocache)
841{ 771{
842 const struct inet_request_sock *ireq = inet_rsk(req); 772 const struct inet_request_sock *ireq = inet_rsk(req);
843 struct flowi4 fl4; 773 struct flowi4 fl4;
@@ -848,31 +778,26 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
848 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) 778 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
849 return -1; 779 return -1;
850 780
851 skb = tcp_make_synack(sk, dst, req, rvp, NULL); 781 skb = tcp_make_synack(sk, dst, req, rvp);
852 782
853 if (skb) { 783 if (skb) {
854 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); 784 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
855 785
856 skb_set_queue_mapping(skb, queue_mapping);
857 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, 786 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
858 ireq->rmt_addr, 787 ireq->rmt_addr,
859 ireq->opt); 788 ireq->opt);
860 err = net_xmit_eval(err); 789 err = net_xmit_eval(err);
861 if (!tcp_rsk(req)->snt_synack && !err)
862 tcp_rsk(req)->snt_synack = tcp_time_stamp;
863 } 790 }
864 791
792 dst_release(dst);
865 return err; 793 return err;
866} 794}
867 795
868static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, 796static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
869 struct request_values *rvp) 797 struct request_values *rvp)
870{ 798{
871 int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); 799 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
872 800 return tcp_v4_send_synack(sk, NULL, req, rvp);
873 if (!res)
874 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
875 return res;
876} 801}
877 802
878/* 803/*
@@ -884,14 +809,14 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
884} 809}
885 810
886/* 811/*
887 * Return true if a syncookie should be sent 812 * Return 1 if a syncookie should be sent
888 */ 813 */
889bool tcp_syn_flood_action(struct sock *sk, 814int tcp_syn_flood_action(struct sock *sk,
890 const struct sk_buff *skb, 815 const struct sk_buff *skb,
891 const char *proto) 816 const char *proto)
892{ 817{
893 const char *msg = "Dropping request"; 818 const char *msg = "Dropping request";
894 bool want_cookie = false; 819 int want_cookie = 0;
895 struct listen_sock *lopt; 820 struct listen_sock *lopt;
896 821
897 822
@@ -899,7 +824,7 @@ bool tcp_syn_flood_action(struct sock *sk,
899#ifdef CONFIG_SYN_COOKIES 824#ifdef CONFIG_SYN_COOKIES
900 if (sysctl_tcp_syncookies) { 825 if (sysctl_tcp_syncookies) {
901 msg = "Sending cookies"; 826 msg = "Sending cookies";
902 want_cookie = true; 827 want_cookie = 1;
903 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); 828 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
904 } else 829 } else
905#endif 830#endif
@@ -908,7 +833,8 @@ bool tcp_syn_flood_action(struct sock *sk,
908 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; 833 lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
909 if (!lopt->synflood_warned) { 834 if (!lopt->synflood_warned) {
910 lopt->synflood_warned = 1; 835 lopt->synflood_warned = 1;
911 pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", 836 pr_info("%s: Possible SYN flooding on port %d. %s. "
837 " Check SNMP counters.\n",
912 proto, ntohs(tcp_hdr(skb)->dest), msg); 838 proto, ntohs(tcp_hdr(skb)->dest), msg);
913 } 839 }
914 return want_cookie; 840 return want_cookie;
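tcp_syn_flood_action() above decides, per listener, whether a SYN that overflowed the queue should be answered with a syncookie (when the syncookies sysctl is on) or dropped, and prints the "possible SYN flooding" message only once per listen socket. A condensed model with the sysctl and the warned flag passed as plain parameters:

#include <stdbool.h>
#include <stdio.h>

/* Returns true when a syncookie should be sent for this SYN. */
bool syn_flood_action(bool syncookies_enabled, int port, bool *synflood_warned)
{
    const char *msg = syncookies_enabled ? "Sending cookies" : "Dropping request";
    bool want_cookie = syncookies_enabled;

    if (!*synflood_warned) {
        *synflood_warned = true;   /* warn once per listening socket */
        printf("TCP: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
               port, msg);
    }
    return want_cookie;
}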
@@ -918,7 +844,8 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
918/* 844/*
919 * Save and compile IPv4 options into the request_sock if needed. 845 * Save and compile IPv4 options into the request_sock if needed.
920 */ 846 */
921static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) 847static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
848 struct sk_buff *skb)
922{ 849{
923 const struct ip_options *opt = &(IPCB(skb)->opt); 850 const struct ip_options *opt = &(IPCB(skb)->opt);
924 struct ip_options_rcu *dopt = NULL; 851 struct ip_options_rcu *dopt = NULL;
@@ -945,138 +872,153 @@ static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
945 */ 872 */
946 873
947/* Find the Key structure for an address. */ 874/* Find the Key structure for an address. */
948struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, 875static struct tcp_md5sig_key *
949 const union tcp_md5_addr *addr, 876 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
950 int family)
951{ 877{
952 struct tcp_sock *tp = tcp_sk(sk); 878 struct tcp_sock *tp = tcp_sk(sk);
953 struct tcp_md5sig_key *key; 879 int i;
954 struct hlist_node *pos; 880
955 unsigned int size = sizeof(struct in_addr); 881 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
956 struct tcp_md5sig_info *md5sig;
957
958 /* caller either holds rcu_read_lock() or socket lock */
959 md5sig = rcu_dereference_check(tp->md5sig_info,
960 sock_owned_by_user(sk) ||
961 lockdep_is_held(&sk->sk_lock.slock));
962 if (!md5sig)
963 return NULL; 882 return NULL;
964#if IS_ENABLED(CONFIG_IPV6) 883 for (i = 0; i < tp->md5sig_info->entries4; i++) {
965 if (family == AF_INET6) 884 if (tp->md5sig_info->keys4[i].addr == addr)
966 size = sizeof(struct in6_addr); 885 return &tp->md5sig_info->keys4[i].base;
967#endif
968 hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
969 if (key->family != family)
970 continue;
971 if (!memcmp(&key->addr, addr, size))
972 return key;
973 } 886 }
974 return NULL; 887 return NULL;
975} 888}
976EXPORT_SYMBOL(tcp_md5_do_lookup);
977 889
978struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, 890struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
979 struct sock *addr_sk) 891 struct sock *addr_sk)
980{ 892{
981 union tcp_md5_addr *addr; 893 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
982
983 addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
984 return tcp_md5_do_lookup(sk, addr, AF_INET);
985} 894}
986EXPORT_SYMBOL(tcp_v4_md5_lookup); 895EXPORT_SYMBOL(tcp_v4_md5_lookup);
987 896
988static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, 897static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
989 struct request_sock *req) 898 struct request_sock *req)
990{ 899{
991 union tcp_md5_addr *addr; 900 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
992
993 addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
994 return tcp_md5_do_lookup(sk, addr, AF_INET);
995} 901}
996 902
997/* This can be called on a newly created socket, from other files */ 903/* This can be called on a newly created socket, from other files */
998int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, 904int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
999 int family, const u8 *newkey, u8 newkeylen, gfp_t gfp) 905 u8 *newkey, u8 newkeylen)
1000{ 906{
1001 /* Add Key to the list */ 907 /* Add Key to the list */
1002 struct tcp_md5sig_key *key; 908 struct tcp_md5sig_key *key;
1003 struct tcp_sock *tp = tcp_sk(sk); 909 struct tcp_sock *tp = tcp_sk(sk);
1004 struct tcp_md5sig_info *md5sig; 910 struct tcp4_md5sig_key *keys;
1005 911
1006 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET); 912 key = tcp_v4_md5_do_lookup(sk, addr);
1007 if (key) { 913 if (key) {
1008 /* Pre-existing entry - just update that one. */ 914 /* Pre-existing entry - just update that one. */
1009 memcpy(key->key, newkey, newkeylen); 915 kfree(key->key);
916 key->key = newkey;
1010 key->keylen = newkeylen; 917 key->keylen = newkeylen;
1011 return 0; 918 } else {
1012 } 919 struct tcp_md5sig_info *md5sig;
920
921 if (!tp->md5sig_info) {
922 tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
923 GFP_ATOMIC);
924 if (!tp->md5sig_info) {
925 kfree(newkey);
926 return -ENOMEM;
927 }
928 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
929 }
1013 930
1014 md5sig = rcu_dereference_protected(tp->md5sig_info, 931 md5sig = tp->md5sig_info;
1015 sock_owned_by_user(sk)); 932 if (md5sig->entries4 == 0 &&
1016 if (!md5sig) { 933 tcp_alloc_md5sig_pool(sk) == NULL) {
1017 md5sig = kmalloc(sizeof(*md5sig), gfp); 934 kfree(newkey);
1018 if (!md5sig)
1019 return -ENOMEM; 935 return -ENOMEM;
936 }
1020 937
1021 sk_nocaps_add(sk, NETIF_F_GSO_MASK); 938 if (md5sig->alloced4 == md5sig->entries4) {
1022 INIT_HLIST_HEAD(&md5sig->head); 939 keys = kmalloc((sizeof(*keys) *
1023 rcu_assign_pointer(tp->md5sig_info, md5sig); 940 (md5sig->entries4 + 1)), GFP_ATOMIC);
1024 } 941 if (!keys) {
942 kfree(newkey);
943 if (md5sig->entries4 == 0)
944 tcp_free_md5sig_pool();
945 return -ENOMEM;
946 }
1025 947
1026 key = sock_kmalloc(sk, sizeof(*key), gfp); 948 if (md5sig->entries4)
1027 if (!key) 949 memcpy(keys, md5sig->keys4,
1028 return -ENOMEM; 950 sizeof(*keys) * md5sig->entries4);
1029 if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1030 sock_kfree_s(sk, key, sizeof(*key));
1031 return -ENOMEM;
1032 }
1033 951
1034 memcpy(key->key, newkey, newkeylen); 952 /* Free old key list, and reference new one */
1035 key->keylen = newkeylen; 953 kfree(md5sig->keys4);
1036 key->family = family; 954 md5sig->keys4 = keys;
1037 memcpy(&key->addr, addr, 955 md5sig->alloced4++;
1038 (family == AF_INET6) ? sizeof(struct in6_addr) : 956 }
1039 sizeof(struct in_addr)); 957 md5sig->entries4++;
1040 hlist_add_head_rcu(&key->node, &md5sig->head); 958 md5sig->keys4[md5sig->entries4 - 1].addr = addr;
959 md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
960 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
961 }
1041 return 0; 962 return 0;
1042} 963}
1043EXPORT_SYMBOL(tcp_md5_do_add); 964EXPORT_SYMBOL(tcp_v4_md5_do_add);
1044 965
1045int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family) 966static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
967 u8 *newkey, u8 newkeylen)
1046{ 968{
1047 struct tcp_sock *tp = tcp_sk(sk); 969 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
1048 struct tcp_md5sig_key *key; 970 newkey, newkeylen);
1049 struct tcp_md5sig_info *md5sig;
1050
1051 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1052 if (!key)
1053 return -ENOENT;
1054 hlist_del_rcu(&key->node);
1055 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1056 kfree_rcu(key, rcu);
1057 md5sig = rcu_dereference_protected(tp->md5sig_info,
1058 sock_owned_by_user(sk));
1059 if (hlist_empty(&md5sig->head))
1060 tcp_free_md5sig_pool();
1061 return 0;
1062} 971}
1063EXPORT_SYMBOL(tcp_md5_do_del);
1064 972
1065static void tcp_clear_md5_list(struct sock *sk) 973int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
1066{ 974{
1067 struct tcp_sock *tp = tcp_sk(sk); 975 struct tcp_sock *tp = tcp_sk(sk);
1068 struct tcp_md5sig_key *key; 976 int i;
1069 struct hlist_node *pos, *n; 977
1070 struct tcp_md5sig_info *md5sig; 978 for (i = 0; i < tp->md5sig_info->entries4; i++) {
979 if (tp->md5sig_info->keys4[i].addr == addr) {
980 /* Free the key */
981 kfree(tp->md5sig_info->keys4[i].base.key);
982 tp->md5sig_info->entries4--;
983
984 if (tp->md5sig_info->entries4 == 0) {
985 kfree(tp->md5sig_info->keys4);
986 tp->md5sig_info->keys4 = NULL;
987 tp->md5sig_info->alloced4 = 0;
988 tcp_free_md5sig_pool();
989 } else if (tp->md5sig_info->entries4 != i) {
990 /* Need to do some manipulation */
991 memmove(&tp->md5sig_info->keys4[i],
992 &tp->md5sig_info->keys4[i+1],
993 (tp->md5sig_info->entries4 - i) *
994 sizeof(struct tcp4_md5sig_key));
995 }
996 return 0;
997 }
998 }
999 return -ENOENT;
1000}
1001EXPORT_SYMBOL(tcp_v4_md5_do_del);
1071 1002
1072 md5sig = rcu_dereference_protected(tp->md5sig_info, 1); 1003static void tcp_v4_clear_md5_list(struct sock *sk)
1004{
1005 struct tcp_sock *tp = tcp_sk(sk);
1073 1006
1074 if (!hlist_empty(&md5sig->head)) 1007 /* Free each key, then the set of key keys,
1008 * the crypto element, and then decrement our
1009 * hold on the last resort crypto.
1010 */
1011 if (tp->md5sig_info->entries4) {
1012 int i;
1013 for (i = 0; i < tp->md5sig_info->entries4; i++)
1014 kfree(tp->md5sig_info->keys4[i].base.key);
1015 tp->md5sig_info->entries4 = 0;
1075 tcp_free_md5sig_pool(); 1016 tcp_free_md5sig_pool();
1076 hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) { 1017 }
1077 hlist_del_rcu(&key->node); 1018 if (tp->md5sig_info->keys4) {
1078 atomic_sub(sizeof(*key), &sk->sk_omem_alloc); 1019 kfree(tp->md5sig_info->keys4);
1079 kfree_rcu(key, rcu); 1020 tp->md5sig_info->keys4 = NULL;
1021 tp->md5sig_info->alloced4 = 0;
1080 } 1022 }
1081} 1023}
1082 1024
@@ -1085,6 +1027,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1085{ 1027{
1086 struct tcp_md5sig cmd; 1028 struct tcp_md5sig cmd;
1087 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; 1029 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1030 u8 *newkey;
1088 1031
1089 if (optlen < sizeof(cmd)) 1032 if (optlen < sizeof(cmd))
1090 return -EINVAL; 1033 return -EINVAL;
@@ -1095,16 +1038,32 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1095 if (sin->sin_family != AF_INET) 1038 if (sin->sin_family != AF_INET)
1096 return -EINVAL; 1039 return -EINVAL;
1097 1040
1098 if (!cmd.tcpm_key || !cmd.tcpm_keylen) 1041 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1099 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1042 if (!tcp_sk(sk)->md5sig_info)
1100 AF_INET); 1043 return -ENOENT;
1044 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1045 }
1101 1046
1102 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) 1047 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1103 return -EINVAL; 1048 return -EINVAL;
1104 1049
1105 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, 1050 if (!tcp_sk(sk)->md5sig_info) {
1106 AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, 1051 struct tcp_sock *tp = tcp_sk(sk);
1107 GFP_KERNEL); 1052 struct tcp_md5sig_info *p;
1053
1054 p = kzalloc(sizeof(*p), sk->sk_allocation);
1055 if (!p)
1056 return -EINVAL;
1057
1058 tp->md5sig_info = p;
1059 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1060 }
1061
1062 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1063 if (!newkey)
1064 return -ENOMEM;
1065 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1066 newkey, cmd.tcpm_keylen);
1108} 1067}
1109 1068
1110static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, 1069static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
@@ -1130,8 +1089,8 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1130 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); 1089 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1131} 1090}
1132 1091
1133static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, 1092static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1134 __be32 daddr, __be32 saddr, const struct tcphdr *th) 1093 __be32 daddr, __be32 saddr, struct tcphdr *th)
1135{ 1094{
1136 struct tcp_md5sig_pool *hp; 1095 struct tcp_md5sig_pool *hp;
1137 struct hash_desc *desc; 1096 struct hash_desc *desc;
@@ -1163,12 +1122,12 @@ clear_hash_noput:
1163} 1122}
1164 1123
1165int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, 1124int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1166 const struct sock *sk, const struct request_sock *req, 1125 struct sock *sk, struct request_sock *req,
1167 const struct sk_buff *skb) 1126 struct sk_buff *skb)
1168{ 1127{
1169 struct tcp_md5sig_pool *hp; 1128 struct tcp_md5sig_pool *hp;
1170 struct hash_desc *desc; 1129 struct hash_desc *desc;
1171 const struct tcphdr *th = tcp_hdr(skb); 1130 struct tcphdr *th = tcp_hdr(skb);
1172 __be32 saddr, daddr; 1131 __be32 saddr, daddr;
1173 1132
1174 if (sk) { 1133 if (sk) {
@@ -1213,7 +1172,7 @@ clear_hash_noput:
1213} 1172}
1214EXPORT_SYMBOL(tcp_v4_md5_hash_skb); 1173EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1215 1174
1216static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) 1175static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1217{ 1176{
1218 /* 1177 /*
1219 * This gets called for each TCP segment that arrives 1178 * This gets called for each TCP segment that arrives
@@ -1223,29 +1182,28 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1223 * o MD5 hash and we're not expecting one. 1182 * o MD5 hash and we're not expecting one.
1224 * o MD5 hash and its wrong. 1183 * o MD5 hash and its wrong.
1225 */ 1184 */
1226 const __u8 *hash_location = NULL; 1185 __u8 *hash_location = NULL;
1227 struct tcp_md5sig_key *hash_expected; 1186 struct tcp_md5sig_key *hash_expected;
1228 const struct iphdr *iph = ip_hdr(skb); 1187 const struct iphdr *iph = ip_hdr(skb);
1229 const struct tcphdr *th = tcp_hdr(skb); 1188 struct tcphdr *th = tcp_hdr(skb);
1230 int genhash; 1189 int genhash;
1231 unsigned char newhash[16]; 1190 unsigned char newhash[16];
1232 1191
1233 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, 1192 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1234 AF_INET);
1235 hash_location = tcp_parse_md5sig_option(th); 1193 hash_location = tcp_parse_md5sig_option(th);
1236 1194
1237 /* We've parsed the options - do we have a hash? */ 1195 /* We've parsed the options - do we have a hash? */
1238 if (!hash_expected && !hash_location) 1196 if (!hash_expected && !hash_location)
1239 return false; 1197 return 0;
1240 1198
1241 if (hash_expected && !hash_location) { 1199 if (hash_expected && !hash_location) {
1242 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); 1200 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1243 return true; 1201 return 1;
1244 } 1202 }
1245 1203
1246 if (!hash_expected && hash_location) { 1204 if (!hash_expected && hash_location) {
1247 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); 1205 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1248 return true; 1206 return 1;
1249 } 1207 }
1250 1208
1251 /* Okay, so this is hash_expected and hash_location - 1209 /* Okay, so this is hash_expected and hash_location -
@@ -1256,14 +1214,15 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1256 NULL, NULL, skb); 1214 NULL, NULL, skb);
1257 1215
1258 if (genhash || memcmp(hash_location, newhash, 16) != 0) { 1216 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1259 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", 1217 if (net_ratelimit()) {
1260 &iph->saddr, ntohs(th->source), 1218 printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1261 &iph->daddr, ntohs(th->dest), 1219 &iph->saddr, ntohs(th->source),
1262 genhash ? " tcp_v4_calc_md5_hash failed" 1220 &iph->daddr, ntohs(th->dest),
1263 : ""); 1221 genhash ? " tcp_v4_calc_md5_hash failed" : "");
1264 return true; 1222 }
1223 return 1;
1265 } 1224 }
1266 return false; 1225 return 0;
1267} 1226}
1268 1227
1269#endif 1228#endif
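The comment at the top of tcp_v4_inbound_md5_hash() enumerates the drop cases: a hash we expected but did not get, a hash we got but did not expect, and a hash that does not match. A small decision sketch of that logic; "computed" stands for the digest we derive from our stored key (NULL if we hold no key for this peer), "received" for the option carried in the segment:

#include <stdbool.h>
#include <string.h>

/* Returns true when the segment must be dropped. */
bool inbound_md5_should_drop(const unsigned char *computed,
                             const unsigned char *received)
{
    if (!computed && !received)
        return false;                 /* no MD5 on this connection          */
    if (!computed || !received)
        return true;                  /* one side signs, the other does not */
    return memcmp(computed, received, 16) != 0;  /* both present: compare   */
}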
@@ -1285,189 +1244,11 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1285}; 1244};
1286#endif 1245#endif
1287 1246
1288static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1289 struct request_sock *req,
1290 struct tcp_fastopen_cookie *foc,
1291 struct tcp_fastopen_cookie *valid_foc)
1292{
1293 bool skip_cookie = false;
1294 struct fastopen_queue *fastopenq;
1295
1296 if (likely(!fastopen_cookie_present(foc))) {
1297 /* See include/net/tcp.h for the meaning of these knobs */
1298 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1299 ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1300 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1301 skip_cookie = true; /* no cookie to validate */
1302 else
1303 return false;
1304 }
1305 fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1306 /* A FO option is present; bump the counter. */
1307 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1308
1309 /* Make sure the listener has enabled fastopen, and we don't
1310 * exceed the max # of pending TFO requests allowed before trying
1311 * to validating the cookie in order to avoid burning CPU cycles
1312 * unnecessarily.
1313 *
1314 * XXX (TFO) - The implication of checking the max_qlen before
1315 * processing a cookie request is that clients can't differentiate
1316 * between qlen overflow causing Fast Open to be disabled
1317 * temporarily vs a server not supporting Fast Open at all.
1318 */
1319 if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1320 fastopenq == NULL || fastopenq->max_qlen == 0)
1321 return false;
1322
1323 if (fastopenq->qlen >= fastopenq->max_qlen) {
1324 struct request_sock *req1;
1325 spin_lock(&fastopenq->lock);
1326 req1 = fastopenq->rskq_rst_head;
1327 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1328 spin_unlock(&fastopenq->lock);
1329 NET_INC_STATS_BH(sock_net(sk),
1330 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1331 /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1332 foc->len = -1;
1333 return false;
1334 }
1335 fastopenq->rskq_rst_head = req1->dl_next;
1336 fastopenq->qlen--;
1337 spin_unlock(&fastopenq->lock);
1338 reqsk_free(req1);
1339 }
1340 if (skip_cookie) {
1341 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1342 return true;
1343 }
1344 if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1345 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1346 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1347 if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1348 memcmp(&foc->val[0], &valid_foc->val[0],
1349 TCP_FASTOPEN_COOKIE_SIZE) != 0)
1350 return false;
1351 valid_foc->len = -1;
1352 }
1353 /* Acknowledge the data received from the peer. */
1354 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1355 return true;
1356 } else if (foc->len == 0) { /* Client requesting a cookie */
1357 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1358 NET_INC_STATS_BH(sock_net(sk),
1359 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1360 } else {
1361 /* Client sent a cookie with wrong size. Treat it
1362 * the same as invalid and return a valid one.
1363 */
1364 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1365 }
1366 return false;
1367}
1368
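tcp_fastopen_check(), removed wholesale in this hunk, validates the client's Fast Open cookie on the server: a missing option may still be accepted under the "always"/"cookie not required" sysctl modes, a full-size cookie must match the one the server would generate for the client's address, and a zero-length option is a cookie request. A compact sketch of the core comparison, with a deliberately toy generator standing in for the real keyed per-host cookie:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define FO_COOKIE_SIZE 8   /* stand-in for TCP_FASTOPEN_COOKIE_SIZE */

/* Toy generator: real servers derive the cookie with a keyed hash of
 * the client address; this exists only to keep the sketch self-contained. */
static void gen_cookie(uint32_t client_addr, uint8_t out[FO_COOKIE_SIZE])
{
    for (int i = 0; i < FO_COOKIE_SIZE; i++)
        out[i] = (uint8_t)(client_addr >> ((i % 4) * 8)) ^ 0x5a;
}

/* Returns true when the SYN's payload may be accepted right away.
 * *want_cookie_reply is set when a (re)issued cookie should be sent. */
bool fastopen_cookie_ok(uint32_t client_addr, const uint8_t *cookie, int len,
                        bool *want_cookie_reply)
{
    uint8_t valid[FO_COOKIE_SIZE];

    *want_cookie_reply = false;
    if (len == FO_COOKIE_SIZE) {
        gen_cookie(client_addr, valid);
        return memcmp(cookie, valid, FO_COOKIE_SIZE) == 0;
    }
    *want_cookie_reply = true;   /* len == 0: cookie request; other sizes: reissue */
    return false;
}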
1369static int tcp_v4_conn_req_fastopen(struct sock *sk,
1370 struct sk_buff *skb,
1371 struct sk_buff *skb_synack,
1372 struct request_sock *req,
1373 struct request_values *rvp)
1374{
1375 struct tcp_sock *tp = tcp_sk(sk);
1376 struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1377 const struct inet_request_sock *ireq = inet_rsk(req);
1378 struct sock *child;
1379 int err;
1380
1381 req->num_retrans = 0;
1382 req->num_timeout = 0;
1383 req->sk = NULL;
1384
1385 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1386 if (child == NULL) {
1387 NET_INC_STATS_BH(sock_net(sk),
1388 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1389 kfree_skb(skb_synack);
1390 return -1;
1391 }
1392 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1393 ireq->rmt_addr, ireq->opt);
1394 err = net_xmit_eval(err);
1395 if (!err)
1396 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1397 /* XXX (TFO) - is it ok to ignore error and continue? */
1398
1399 spin_lock(&queue->fastopenq->lock);
1400 queue->fastopenq->qlen++;
1401 spin_unlock(&queue->fastopenq->lock);
1402
1403 /* Initialize the child socket. Have to fix some values to take
1404 * into account the child is a Fast Open socket and is created
1405 * only out of the bits carried in the SYN packet.
1406 */
1407 tp = tcp_sk(child);
1408
1409 tp->fastopen_rsk = req;
1410 /* Do a hold on the listener sk so that if the listener is being
1411 * closed, the child that has been accepted can live on and still
1412 * access listen_lock.
1413 */
1414 sock_hold(sk);
1415 tcp_rsk(req)->listener = sk;
1416
1417 /* RFC1323: The window in SYN & SYN/ACK segments is never
1418 * scaled. So correct it appropriately.
1419 */
1420 tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1421
1422 /* Activate the retrans timer so that SYNACK can be retransmitted.
1423 * The request socket is not added to the SYN table of the parent
1424 * because it's been added to the accept queue directly.
1425 */
1426 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1427 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1428
1429 /* Add the child socket directly into the accept queue */
1430 inet_csk_reqsk_queue_add(sk, req, child);
1431
1432 /* Now finish processing the fastopen child socket. */
1433 inet_csk(child)->icsk_af_ops->rebuild_header(child);
1434 tcp_init_congestion_control(child);
1435 tcp_mtup_init(child);
1436 tcp_init_buffer_space(child);
1437 tcp_init_metrics(child);
1438
1439 /* Queue the data carried in the SYN packet. We need to first
1440 * bump skb's refcnt because the caller will attempt to free it.
1441 *
1442 * XXX (TFO) - we honor a zero-payload TFO request for now.
1443 * (Any reason not to?)
1444 */
1445 if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1446 /* Don't queue the skb if there is no payload in SYN.
1447 * XXX (TFO) - How about SYN+FIN?
1448 */
1449 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1450 } else {
1451 skb = skb_get(skb);
1452 skb_dst_drop(skb);
1453 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1454 skb_set_owner_r(skb, child);
1455 __skb_queue_tail(&child->sk_receive_queue, skb);
1456 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1457 tp->syn_data_acked = 1;
1458 }
1459 sk->sk_data_ready(sk, 0);
1460 bh_unlock_sock(child);
1461 sock_put(child);
1462 WARN_ON(req->sk == NULL);
1463 return 0;
1464}
1465
1466int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) 1247int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1467{ 1248{
1468 struct tcp_extend_values tmp_ext; 1249 struct tcp_extend_values tmp_ext;
1469 struct tcp_options_received tmp_opt; 1250 struct tcp_options_received tmp_opt;
1470 const u8 *hash_location; 1251 u8 *hash_location;
1471 struct request_sock *req; 1252 struct request_sock *req;
1472 struct inet_request_sock *ireq; 1253 struct inet_request_sock *ireq;
1473 struct tcp_sock *tp = tcp_sk(sk); 1254 struct tcp_sock *tp = tcp_sk(sk);
@@ -1475,12 +1256,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1475 __be32 saddr = ip_hdr(skb)->saddr; 1256 __be32 saddr = ip_hdr(skb)->saddr;
1476 __be32 daddr = ip_hdr(skb)->daddr; 1257 __be32 daddr = ip_hdr(skb)->daddr;
1477 __u32 isn = TCP_SKB_CB(skb)->when; 1258 __u32 isn = TCP_SKB_CB(skb)->when;
1478 bool want_cookie = false; 1259 int want_cookie = 0;
1479 struct flowi4 fl4;
1480 struct tcp_fastopen_cookie foc = { .len = -1 };
1481 struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1482 struct sk_buff *skb_synack;
1483 int do_fastopen;
1484 1260
1485 /* Never answer to SYNs sent to broadcast or multicast */ 1261
1486 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 1262 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
@@ -1515,8 +1291,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1515 tcp_clear_options(&tmp_opt); 1291 tcp_clear_options(&tmp_opt);
1516 tmp_opt.mss_clamp = TCP_MSS_DEFAULT; 1292 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1517 tmp_opt.user_mss = tp->rx_opt.user_mss; 1293 tmp_opt.user_mss = tp->rx_opt.user_mss;
1518 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, 1294 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1519 want_cookie ? NULL : &foc);
1520 1295
1521 if (tmp_opt.cookie_plus > 0 && 1296 if (tmp_opt.cookie_plus > 0 &&
1522 tmp_opt.saw_tstamp && 1297 tmp_opt.saw_tstamp &&
@@ -1540,7 +1315,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1540 while (l-- > 0) 1315 while (l-- > 0)
1541 *c++ ^= *hash_location++; 1316 *c++ ^= *hash_location++;
1542 1317
1543 want_cookie = false; /* not our kind of cookie */ 1318 want_cookie = 0; /* not our kind of cookie */
1544 tmp_ext.cookie_out_never = 0; /* false */ 1319 tmp_ext.cookie_out_never = 0; /* false */
1545 tmp_ext.cookie_plus = tmp_opt.cookie_plus; 1320 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1546 } else if (!tp->rx_opt.cookie_in_always) { 1321 } else if (!tp->rx_opt.cookie_in_always) {
@@ -1562,18 +1337,21 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1562 ireq->loc_addr = daddr; 1337 ireq->loc_addr = daddr;
1563 ireq->rmt_addr = saddr; 1338 ireq->rmt_addr = saddr;
1564 ireq->no_srccheck = inet_sk(sk)->transparent; 1339 ireq->no_srccheck = inet_sk(sk)->transparent;
1565 ireq->opt = tcp_v4_save_options(skb); 1340 ireq->opt = tcp_v4_save_options(sk, skb);
1566 1341
1567 if (security_inet_conn_request(sk, skb, req)) 1342 if (security_inet_conn_request(sk, skb, req))
1568 goto drop_and_free; 1343 goto drop_and_free;
1569 1344
1570 if (!want_cookie || tmp_opt.tstamp_ok) 1345 if (!want_cookie || tmp_opt.tstamp_ok)
1571 TCP_ECN_create_request(req, skb); 1346 TCP_ECN_create_request(req, tcp_hdr(skb));
1572 1347
1573 if (want_cookie) { 1348 if (want_cookie) {
1574 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1349 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1575 req->cookie_ts = tmp_opt.tstamp_ok; 1350 req->cookie_ts = tmp_opt.tstamp_ok;
1576 } else if (!isn) { 1351 } else if (!isn) {
1352 struct inet_peer *peer = NULL;
1353 struct flowi4 fl4;
1354
1577 /* VJ's idea. We save last timestamp seen 1355 /* VJ's idea. We save last timestamp seen
1578 * from the destination in peer table, when entering 1356 * from the destination in peer table, when entering
1579 * state TIME-WAIT, and check against it before 1357 * state TIME-WAIT, and check against it before
@@ -1586,8 +1364,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1586 if (tmp_opt.saw_tstamp && 1364 if (tmp_opt.saw_tstamp &&
1587 tcp_death_row.sysctl_tw_recycle && 1365 tcp_death_row.sysctl_tw_recycle &&
1588 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && 1366 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1589 fl4.daddr == saddr) { 1367 fl4.daddr == saddr &&
1590 if (!tcp_peer_is_proven(req, dst, true)) { 1368 (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1369 inet_peer_refcheck(peer);
1370 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1371 (s32)(peer->tcp_ts - req->ts_recent) >
1372 TCP_PAWS_WINDOW) {
1591 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1373 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1592 goto drop_and_release; 1374 goto drop_and_release;
1593 } 1375 }
@@ -1596,7 +1378,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1596 else if (!sysctl_tcp_syncookies && 1378 else if (!sysctl_tcp_syncookies &&
1597 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 1379 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1598 (sysctl_max_syn_backlog >> 2)) && 1380 (sysctl_max_syn_backlog >> 2)) &&
1599 !tcp_peer_is_proven(req, dst, false)) { 1381 (!peer || !peer->tcp_ts_stamp) &&
1382 (!dst || !dst_metric(dst, RTAX_RTT))) {
1600 /* Without syncookies last quarter of 1383 /* Without syncookies last quarter of
1601 * backlog is filled with destinations, 1384 * backlog is filled with destinations,
1602 * proven to be alive. 1385 * proven to be alive.
@@ -1604,7 +1387,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1604 * to destinations, already remembered 1387 * to destinations, already remembered
1605 * to the moment of synflood. 1388 * to the moment of synflood.
1606 */ 1389 */
1607 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), 1390 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1608 &saddr, ntohs(tcp_hdr(skb)->source)); 1391 &saddr, ntohs(tcp_hdr(skb)->source));
1609 goto drop_and_release; 1392 goto drop_and_release;
1610 } 1393 }
@@ -1612,54 +1395,14 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1612 isn = tcp_v4_init_sequence(skb); 1395 isn = tcp_v4_init_sequence(skb);
1613 } 1396 }
1614 tcp_rsk(req)->snt_isn = isn; 1397 tcp_rsk(req)->snt_isn = isn;
1398 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1615 1399
1616 if (dst == NULL) { 1400 if (tcp_v4_send_synack(sk, dst, req,
1617 dst = inet_csk_route_req(sk, &fl4, req); 1401 (struct request_values *)&tmp_ext) ||
1618 if (dst == NULL) 1402 want_cookie)
1619 goto drop_and_free;
1620 }
1621 do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1622
1623 /* We don't call tcp_v4_send_synack() directly because we need
1624 * to make sure a child socket can be created successfully before
1625 * sending back synack!
1626 *
1627 * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1628 * (or better yet, call tcp_send_synack() in the child context
1629 * directly, but will have to fix bunch of other code first)
1630 * after syn_recv_sock() except one will need to first fix the
1631 * latter to remove its dependency on the current implementation
1632 * of tcp_v4_send_synack()->tcp_select_initial_window().
1633 */
1634 skb_synack = tcp_make_synack(sk, dst, req,
1635 (struct request_values *)&tmp_ext,
1636 fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1637
1638 if (skb_synack) {
1639 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1640 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1641 } else
1642 goto drop_and_free;
1643
1644 if (likely(!do_fastopen)) {
1645 int err;
1646 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1647 ireq->rmt_addr, ireq->opt);
1648 err = net_xmit_eval(err);
1649 if (err || want_cookie)
1650 goto drop_and_free;
1651
1652 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1653 tcp_rsk(req)->listener = NULL;
1654 /* Add the request_sock to the SYN table */
1655 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1656 if (fastopen_cookie_present(&foc) && foc.len != 0)
1657 NET_INC_STATS_BH(sock_net(sk),
1658 LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1659 } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1660 (struct request_values *)&tmp_ext))
1661 goto drop_and_free; 1403 goto drop_and_free;
1662 1404
1405 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1663 return 0; 1406 return 0;
1664 1407
1665drop_and_release: 1408drop_and_release:
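
The hunk above restores the inetpeer-based variant of the PAWS recycling check described in the "VJ's idea" comment: a cached per-destination timestamp is only trusted while it is younger than TCP_PAWS_MSL, and a SYN whose timestamp has gone backwards by more than TCP_PAWS_WINDOW is rejected. The following is a minimal standalone sketch of that freshness test, not kernel code; TCP_PAWS_MSL = 60 and TCP_PAWS_WINDOW = 1 are the values these macros are assumed to have in this tree.

/* Standalone sketch of the PAWS-style freshness test used above.
 * peer_ts / peer_ts_stamp stand in for the inetpeer (or tcp_metrics)
 * fields; all comparisons use the same wraparound-safe arithmetic. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define TCP_PAWS_MSL    60  /* seconds a cached timestamp stays trustworthy */
#define TCP_PAWS_WINDOW  1  /* allowed ticks of timestamp regression */

static bool paws_reject_syn(uint32_t now, uint32_t peer_ts_stamp,
                            uint32_t peer_ts, uint32_t req_ts_recent)
{
	/* Reject only if the cached timestamp is recent enough to trust
	 * and the SYN's timestamp went backwards by more than the window. */
	return (now - peer_ts_stamp) < TCP_PAWS_MSL &&
	       (int32_t)(peer_ts - req_ts_recent) > TCP_PAWS_WINDOW;
}

int main(void)
{
	uint32_t now = (uint32_t)time(NULL);

	/* Timestamp regressed by 1000 ticks within the MSL window: drop. */
	printf("%d\n", paws_reject_syn(now, now - 5, 100000, 99000));
	/* Cached state is stale (older than TCP_PAWS_MSL): accept. */
	printf("%d\n", paws_reject_syn(now, now - 120, 100000, 99000));
	return 0;
}
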
@@ -1697,7 +1440,6 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1697 goto exit_nonewsk; 1440 goto exit_nonewsk;
1698 1441
1699 newsk->sk_gso_type = SKB_GSO_TCPV4; 1442 newsk->sk_gso_type = SKB_GSO_TCPV4;
1700 inet_sk_rx_dst_set(newsk, skb);
1701 1443
1702 newtp = tcp_sk(newsk); 1444 newtp = tcp_sk(newsk);
1703 newinet = inet_sk(newsk); 1445 newinet = inet_sk(newsk);
@@ -1710,19 +1452,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1710 ireq->opt = NULL; 1452 ireq->opt = NULL;
1711 newinet->mc_index = inet_iif(skb); 1453 newinet->mc_index = inet_iif(skb);
1712 newinet->mc_ttl = ip_hdr(skb)->ttl; 1454 newinet->mc_ttl = ip_hdr(skb)->ttl;
1713 newinet->rcv_tos = ip_hdr(skb)->tos;
1714 inet_csk(newsk)->icsk_ext_hdr_len = 0; 1455 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1715 if (inet_opt) 1456 if (inet_opt)
1716 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 1457 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1717 newinet->inet_id = newtp->write_seq ^ jiffies; 1458 newinet->inet_id = newtp->write_seq ^ jiffies;
1718 1459
1719 if (!dst) { 1460 if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1720 dst = inet_csk_route_child_sock(sk, newsk, req); 1461 goto put_and_exit;
1721 if (!dst) 1462
1722 goto put_and_exit;
1723 } else {
1724 /* syncookie case : see end of cookie_v4_check() */
1725 }
1726 sk_setup_caps(newsk, dst); 1463 sk_setup_caps(newsk, dst);
1727 1464
1728 tcp_mtup_init(newsk); 1465 tcp_mtup_init(newsk);
@@ -1733,13 +1470,14 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1733 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; 1470 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1734 1471
1735 tcp_initialize_rcv_mss(newsk); 1472 tcp_initialize_rcv_mss(newsk);
1736 tcp_synack_rtt_meas(newsk, req); 1473 if (tcp_rsk(req)->snt_synack)
1737 newtp->total_retrans = req->num_retrans; 1474 tcp_valid_rtt_meas(newsk,
1475 tcp_time_stamp - tcp_rsk(req)->snt_synack);
1476 newtp->total_retrans = req->retrans;
1738 1477
1739#ifdef CONFIG_TCP_MD5SIG 1478#ifdef CONFIG_TCP_MD5SIG
1740 /* Copy over the MD5 key from the original socket */ 1479 /* Copy over the MD5 key from the original socket */
1741 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, 1480 key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1742 AF_INET);
1743 if (key != NULL) { 1481 if (key != NULL) {
1744 /* 1482 /*
1745 * We're using one, so create a matching key 1483 * We're using one, so create a matching key
@@ -1747,8 +1485,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1747 * memory, then we end up not copying the key 1485 * memory, then we end up not copying the key
1748 * across. Shucks. 1486 * across. Shucks.
1749 */ 1487 */
1750 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, 1488 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1751 AF_INET, key->key, key->keylen, GFP_ATOMIC); 1489 if (newkey != NULL)
1490 tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1491 newkey, key->keylen);
1752 sk_nocaps_add(newsk, NETIF_F_GSO_MASK); 1492 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1753 } 1493 }
1754#endif 1494#endif
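
The hunk above replaces tcp_synack_rtt_meas() with the older pattern: the listener stamps snt_synack when the SYN-ACK goes out, and when the final ACK creates the child socket the RTT sample is simply tcp_time_stamp - snt_synack (taken only if the stamp is non-zero). A small sketch of that idea, where now_ms() is a hypothetical stand-in for the kernel's jiffies-based clock:

/* Sketch of the SYN-ACK RTT sample restored above: remember when the
 * SYN-ACK was sent and subtract that from "now" once the handshake
 * completes. Purely illustrative, not the kernel helpers. */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint32_t now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint32_t)(ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
}

struct toy_request { uint32_t snt_synack; };

int main(void)
{
	struct toy_request req = { .snt_synack = now_ms() }; /* SYN-ACK transmitted */

	/* ... the final ACK of the handshake arrives some time later ... */
	uint32_t rtt = now_ms() - req.snt_synack;
	printf("synack rtt sample: %u ms (used only if snt_synack != 0)\n", rtt);
	return 0;
}
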
@@ -1767,8 +1507,7 @@ exit:
1767 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 1507 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1768 return NULL; 1508 return NULL;
1769put_and_exit: 1509put_and_exit:
1770 inet_csk_prepare_forced_close(newsk); 1510 sock_put(newsk);
1771 tcp_done(newsk);
1772 goto exit; 1511 goto exit;
1773} 1512}
1774EXPORT_SYMBOL(tcp_v4_syn_recv_sock); 1513EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
@@ -1783,7 +1522,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1783 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, 1522 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1784 iph->saddr, iph->daddr); 1523 iph->saddr, iph->daddr);
1785 if (req) 1524 if (req)
1786 return tcp_check_req(sk, skb, req, prev, false); 1525 return tcp_check_req(sk, skb, req, prev);
1787 1526
1788 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, 1527 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1789 th->source, iph->daddr, th->dest, inet_iif(skb)); 1528 th->source, iph->daddr, th->dest, inet_iif(skb));
@@ -1849,16 +1588,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1849#endif 1588#endif
1850 1589
1851 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1590 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1852 struct dst_entry *dst = sk->sk_rx_dst; 1591 sock_rps_save_rxhash(sk, skb->rxhash);
1853
1854 sock_rps_save_rxhash(sk, skb);
1855 if (dst) {
1856 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1857 dst->ops->check(dst, 0) == NULL) {
1858 dst_release(dst);
1859 sk->sk_rx_dst = NULL;
1860 }
1861 }
1862 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1592 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1863 rsk = sk; 1593 rsk = sk;
1864 goto reset; 1594 goto reset;
@@ -1875,7 +1605,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1875 goto discard; 1605 goto discard;
1876 1606
1877 if (nsk != sk) { 1607 if (nsk != sk) {
1878 sock_rps_save_rxhash(nsk, skb); 1608 sock_rps_save_rxhash(nsk, skb->rxhash);
1879 if (tcp_child_process(sk, nsk, skb)) { 1609 if (tcp_child_process(sk, nsk, skb)) {
1880 rsk = nsk; 1610 rsk = nsk;
1881 goto reset; 1611 goto reset;
@@ -1883,7 +1613,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1883 return 0; 1613 return 0;
1884 } 1614 }
1885 } else 1615 } else
1886 sock_rps_save_rxhash(sk, skb); 1616 sock_rps_save_rxhash(sk, skb->rxhash);
1887 1617
1888 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { 1618 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1889 rsk = sk; 1619 rsk = sk;
@@ -1908,43 +1638,6 @@ csum_err:
1908} 1638}
1909EXPORT_SYMBOL(tcp_v4_do_rcv); 1639EXPORT_SYMBOL(tcp_v4_do_rcv);
1910 1640
1911void tcp_v4_early_demux(struct sk_buff *skb)
1912{
1913 const struct iphdr *iph;
1914 const struct tcphdr *th;
1915 struct sock *sk;
1916
1917 if (skb->pkt_type != PACKET_HOST)
1918 return;
1919
1920 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1921 return;
1922
1923 iph = ip_hdr(skb);
1924 th = tcp_hdr(skb);
1925
1926 if (th->doff < sizeof(struct tcphdr) / 4)
1927 return;
1928
1929 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1930 iph->saddr, th->source,
1931 iph->daddr, ntohs(th->dest),
1932 skb->skb_iif);
1933 if (sk) {
1934 skb->sk = sk;
1935 skb->destructor = sock_edemux;
1936 if (sk->sk_state != TCP_TIME_WAIT) {
1937 struct dst_entry *dst = sk->sk_rx_dst;
1938
1939 if (dst)
1940 dst = dst_check(dst, 0);
1941 if (dst &&
1942 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1943 skb_dst_set_noref(skb, dst);
1944 }
1945 }
1946}
1947
1948/* 1641/*
1949 * From tcp_input.c 1642 * From tcp_input.c
1950 */ 1643 */
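
The hunk above also drops tcp_v4_early_demux(), which short-circuits routing for established flows by looking the socket up from the packet's 4-tuple as soon as the TCP header is readable, then reusing the destination cached on that socket. A toy sketch of the idea follows; the single-slot-per-bucket table and the hash function are invented for illustration and do not mirror the real inet hash tables.

/* Toy "early demux": hash the 4-tuple of an incoming segment and look
 * for an established connection before the full routing step. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct flow4 { uint32_t saddr, daddr; uint16_t sport, dport; };
struct toy_sock { struct flow4 key; int in_use; };

#define TABLE_BITS 8
static struct toy_sock table[1 << TABLE_BITS];

static unsigned int flow_hash(const struct flow4 *f)
{
	uint32_t h = f->saddr ^ f->daddr ^ (((uint32_t)f->sport << 16) | f->dport);

	h *= 0x9e3779b1u;               /* simple multiplicative mixing */
	return h >> (32 - TABLE_BITS);
}

static struct toy_sock *early_demux(const struct flow4 *f)
{
	struct toy_sock *sk = &table[flow_hash(f)];

	return (sk->in_use && !memcmp(&sk->key, f, sizeof(*f))) ? sk : NULL;
}

int main(void)
{
	struct flow4 f = { 0x0a000001, 0x0a000002, 12345, 80 };

	table[flow_hash(&f)] = (struct toy_sock){ .key = f, .in_use = 1 };
	printf("hit: %s\n", early_demux(&f) ? "yes" : "no");
	return 0;
}
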
@@ -1952,7 +1645,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
1952int tcp_v4_rcv(struct sk_buff *skb) 1645int tcp_v4_rcv(struct sk_buff *skb)
1953{ 1646{
1954 const struct iphdr *iph; 1647 const struct iphdr *iph;
1955 const struct tcphdr *th; 1648 struct tcphdr *th;
1956 struct sock *sk; 1649 struct sock *sk;
1957 int ret; 1650 int ret;
1958 struct net *net = dev_net(skb->dev); 1651 struct net *net = dev_net(skb->dev);
@@ -1987,7 +1680,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
1987 skb->len - th->doff * 4); 1680 skb->len - th->doff * 4);
1988 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); 1681 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1989 TCP_SKB_CB(skb)->when = 0; 1682 TCP_SKB_CB(skb)->when = 0;
1990 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); 1683 TCP_SKB_CB(skb)->flags = iph->tos;
1991 TCP_SKB_CB(skb)->sacked = 0; 1684 TCP_SKB_CB(skb)->sacked = 0;
1992 1685
1993 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); 1686 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
@@ -2018,7 +1711,7 @@ process:
2018#ifdef CONFIG_NET_DMA 1711#ifdef CONFIG_NET_DMA
2019 struct tcp_sock *tp = tcp_sk(sk); 1712 struct tcp_sock *tp = tcp_sk(sk);
2020 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 1713 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2021 tp->ucopy.dma_chan = net_dma_find_channel(); 1714 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
2022 if (tp->ucopy.dma_chan) 1715 if (tp->ucopy.dma_chan)
2023 ret = tcp_v4_do_rcv(sk, skb); 1716 ret = tcp_v4_do_rcv(sk, skb);
2024 else 1717 else
@@ -2027,8 +1720,7 @@ process:
2027 if (!tcp_prequeue(sk, skb)) 1720 if (!tcp_prequeue(sk, skb))
2028 ret = tcp_v4_do_rcv(sk, skb); 1721 ret = tcp_v4_do_rcv(sk, skb);
2029 } 1722 }
2030 } else if (unlikely(sk_add_backlog(sk, skb, 1723 } else if (unlikely(sk_add_backlog(sk, skb))) {
2031 sk->sk_rcvbuf + sk->sk_sndbuf))) {
2032 bh_unlock_sock(sk); 1724 bh_unlock_sock(sk);
2033 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); 1725 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2034 goto discard_and_relse; 1726 goto discard_and_relse;
@@ -2094,29 +1786,49 @@ do_time_wait:
2094 goto discard_it; 1786 goto discard_it;
2095} 1787}
2096 1788
1789struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1790{
1791 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1792 struct inet_sock *inet = inet_sk(sk);
1793 struct inet_peer *peer;
1794
1795 if (!rt ||
1796 inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1797 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1798 *release_it = true;
1799 } else {
1800 if (!rt->peer)
1801 rt_bind_peer(rt, inet->inet_daddr, 1);
1802 peer = rt->peer;
1803 *release_it = false;
1804 }
1805
1806 return peer;
1807}
1808EXPORT_SYMBOL(tcp_v4_get_peer);
1809
1810void *tcp_v4_tw_get_peer(struct sock *sk)
1811{
1812 struct inet_timewait_sock *tw = inet_twsk(sk);
1813
1814 return inet_getpeer_v4(tw->tw_daddr, 1);
1815}
1816EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1817
2097static struct timewait_sock_ops tcp_timewait_sock_ops = { 1818static struct timewait_sock_ops tcp_timewait_sock_ops = {
2098 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1819 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2099 .twsk_unique = tcp_twsk_unique, 1820 .twsk_unique = tcp_twsk_unique,
2100 .twsk_destructor= tcp_twsk_destructor, 1821 .twsk_destructor= tcp_twsk_destructor,
1822 .twsk_getpeer = tcp_v4_tw_get_peer,
2101}; 1823};
2102 1824
2103void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2104{
2105 struct dst_entry *dst = skb_dst(skb);
2106
2107 dst_hold(dst);
2108 sk->sk_rx_dst = dst;
2109 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2110}
2111EXPORT_SYMBOL(inet_sk_rx_dst_set);
2112
2113const struct inet_connection_sock_af_ops ipv4_specific = { 1825const struct inet_connection_sock_af_ops ipv4_specific = {
2114 .queue_xmit = ip_queue_xmit, 1826 .queue_xmit = ip_queue_xmit,
2115 .send_check = tcp_v4_send_check, 1827 .send_check = tcp_v4_send_check,
2116 .rebuild_header = inet_sk_rebuild_header, 1828 .rebuild_header = inet_sk_rebuild_header,
2117 .sk_rx_dst_set = inet_sk_rx_dst_set,
2118 .conn_request = tcp_v4_conn_request, 1829 .conn_request = tcp_v4_conn_request,
2119 .syn_recv_sock = tcp_v4_syn_recv_sock, 1830 .syn_recv_sock = tcp_v4_syn_recv_sock,
1831 .get_peer = tcp_v4_get_peer,
2120 .net_header_len = sizeof(struct iphdr), 1832 .net_header_len = sizeof(struct iphdr),
2121 .setsockopt = ip_setsockopt, 1833 .setsockopt = ip_setsockopt,
2122 .getsockopt = ip_getsockopt, 1834 .getsockopt = ip_getsockopt,
@@ -2134,6 +1846,7 @@ EXPORT_SYMBOL(ipv4_specific);
2134static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { 1846static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2135 .md5_lookup = tcp_v4_md5_lookup, 1847 .md5_lookup = tcp_v4_md5_lookup,
2136 .calc_md5_hash = tcp_v4_md5_hash_skb, 1848 .calc_md5_hash = tcp_v4_md5_hash_skb,
1849 .md5_add = tcp_v4_md5_add_func,
2137 .md5_parse = tcp_v4_parse_md5_keys, 1850 .md5_parse = tcp_v4_parse_md5_keys,
2138}; 1851};
2139#endif 1852#endif
@@ -2144,15 +1857,63 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2144static int tcp_v4_init_sock(struct sock *sk) 1857static int tcp_v4_init_sock(struct sock *sk)
2145{ 1858{
2146 struct inet_connection_sock *icsk = inet_csk(sk); 1859 struct inet_connection_sock *icsk = inet_csk(sk);
1860 struct tcp_sock *tp = tcp_sk(sk);
2147 1861
2148 tcp_init_sock(sk); 1862 skb_queue_head_init(&tp->out_of_order_queue);
1863 tcp_init_xmit_timers(sk);
1864 tcp_prequeue_init(tp);
2149 1865
2150 icsk->icsk_af_ops = &ipv4_specific; 1866 icsk->icsk_rto = TCP_TIMEOUT_INIT;
1867 tp->mdev = TCP_TIMEOUT_INIT;
1868
1869 /* So many TCP implementations out there (incorrectly) count the
1870 * initial SYN frame in their delayed-ACK and congestion control
1871 * algorithms that we must have the following bandaid to talk
1872 * efficiently to them. -DaveM
1873 */
1874 tp->snd_cwnd = TCP_INIT_CWND;
1875
1876 /* See draft-stevens-tcpca-spec-01 for discussion of the
1877 * initialization of these values.
1878 */
1879 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1880 tp->snd_cwnd_clamp = ~0;
1881 tp->mss_cache = TCP_MSS_DEFAULT;
1882
1883 tp->reordering = sysctl_tcp_reordering;
1884 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
2151 1885
1886 sk->sk_state = TCP_CLOSE;
1887
1888 sk->sk_write_space = sk_stream_write_space;
1889 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1890
1891 icsk->icsk_af_ops = &ipv4_specific;
1892 icsk->icsk_sync_mss = tcp_sync_mss;
2152#ifdef CONFIG_TCP_MD5SIG 1893#ifdef CONFIG_TCP_MD5SIG
2153 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; 1894 tp->af_specific = &tcp_sock_ipv4_specific;
2154#endif 1895#endif
2155 1896
1897 /* TCP Cookie Transactions */
1898 if (sysctl_tcp_cookie_size > 0) {
1899 /* Default, cookies without s_data_payload. */
1900 tp->cookie_values =
1901 kzalloc(sizeof(*tp->cookie_values),
1902 sk->sk_allocation);
1903 if (tp->cookie_values != NULL)
1904 kref_init(&tp->cookie_values->kref);
1905 }
1906 /* Presumed zeroed, in order of appearance:
1907 * cookie_in_always, cookie_out_never,
1908 * s_data_constant, s_data_in, s_data_out
1909 */
1910 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1911 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1912
1913 local_bh_disable();
1914 percpu_counter_inc(&tcp_sockets_allocated);
1915 local_bh_enable();
1916
2156 return 0; 1917 return 0;
2157} 1918}
2158 1919
@@ -2173,8 +1934,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
2173#ifdef CONFIG_TCP_MD5SIG 1934#ifdef CONFIG_TCP_MD5SIG
2174 /* Clean up the MD5 key list, if any */ 1935 /* Clean up the MD5 key list, if any */
2175 if (tp->md5sig_info) { 1936 if (tp->md5sig_info) {
2176 tcp_clear_md5_list(sk); 1937 tcp_v4_clear_md5_list(sk);
2177 kfree_rcu(tp->md5sig_info, rcu); 1938 kfree(tp->md5sig_info);
2178 tp->md5sig_info = NULL; 1939 tp->md5sig_info = NULL;
2179 } 1940 }
2180#endif 1941#endif
@@ -2191,19 +1952,22 @@ void tcp_v4_destroy_sock(struct sock *sk)
2191 if (inet_csk(sk)->icsk_bind_hash) 1952 if (inet_csk(sk)->icsk_bind_hash)
2192 inet_put_port(sk); 1953 inet_put_port(sk);
2193 1954
1955 /*
1956 * If sendmsg cached page exists, toss it.
1957 */
1958 if (sk->sk_sndmsg_page) {
1959 __free_page(sk->sk_sndmsg_page);
1960 sk->sk_sndmsg_page = NULL;
1961 }
1962
2194 /* TCP Cookie Transactions */ 1963 /* TCP Cookie Transactions */
2195 if (tp->cookie_values != NULL) { 1964 if (tp->cookie_values != NULL) {
2196 kref_put(&tp->cookie_values->kref, 1965 kref_put(&tp->cookie_values->kref,
2197 tcp_cookie_values_release); 1966 tcp_cookie_values_release);
2198 tp->cookie_values = NULL; 1967 tp->cookie_values = NULL;
2199 } 1968 }
2200 BUG_ON(tp->fastopen_rsk != NULL);
2201 1969
2202 /* If socket is aborted during connect operation */ 1970 percpu_counter_dec(&tcp_sockets_allocated);
2203 tcp_free_fastopen_req(tp);
2204
2205 sk_sockets_allocated_dec(sk);
2206 sock_release_memcg(sk);
2207} 1971}
2208EXPORT_SYMBOL(tcp_v4_destroy_sock); 1972EXPORT_SYMBOL(tcp_v4_destroy_sock);
2209 1973
@@ -2325,7 +2089,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2325 return rc; 2089 return rc;
2326} 2090}
2327 2091
2328static inline bool empty_bucket(struct tcp_iter_state *st) 2092static inline int empty_bucket(struct tcp_iter_state *st)
2329{ 2093{
2330 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) && 2094 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2331 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain); 2095 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
@@ -2572,7 +2336,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2572 } 2336 }
2573} 2337}
2574 2338
2575int tcp_seq_open(struct inode *inode, struct file *file) 2339static int tcp_seq_open(struct inode *inode, struct file *file)
2576{ 2340{
2577 struct tcp_seq_afinfo *afinfo = PDE(inode)->data; 2341 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2578 struct tcp_iter_state *s; 2342 struct tcp_iter_state *s;
@@ -2588,19 +2352,23 @@ int tcp_seq_open(struct inode *inode, struct file *file)
2588 s->last_pos = 0; 2352 s->last_pos = 0;
2589 return 0; 2353 return 0;
2590} 2354}
2591EXPORT_SYMBOL(tcp_seq_open);
2592 2355
2593int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) 2356int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2594{ 2357{
2595 int rc = 0; 2358 int rc = 0;
2596 struct proc_dir_entry *p; 2359 struct proc_dir_entry *p;
2597 2360
2361 afinfo->seq_fops.open = tcp_seq_open;
2362 afinfo->seq_fops.read = seq_read;
2363 afinfo->seq_fops.llseek = seq_lseek;
2364 afinfo->seq_fops.release = seq_release_net;
2365
2598 afinfo->seq_ops.start = tcp_seq_start; 2366 afinfo->seq_ops.start = tcp_seq_start;
2599 afinfo->seq_ops.next = tcp_seq_next; 2367 afinfo->seq_ops.next = tcp_seq_next;
2600 afinfo->seq_ops.stop = tcp_seq_stop; 2368 afinfo->seq_ops.stop = tcp_seq_stop;
2601 2369
2602 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, 2370 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2603 afinfo->seq_fops, afinfo); 2371 &afinfo->seq_fops, afinfo);
2604 if (!p) 2372 if (!p)
2605 rc = -ENOMEM; 2373 rc = -ENOMEM;
2606 return rc; 2374 return rc;
@@ -2613,11 +2381,11 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2613} 2381}
2614EXPORT_SYMBOL(tcp_proc_unregister); 2382EXPORT_SYMBOL(tcp_proc_unregister);
2615 2383
2616static void get_openreq4(const struct sock *sk, const struct request_sock *req, 2384static void get_openreq4(struct sock *sk, struct request_sock *req,
2617 struct seq_file *f, int i, kuid_t uid, int *len) 2385 struct seq_file *f, int i, int uid, int *len)
2618{ 2386{
2619 const struct inet_request_sock *ireq = inet_rsk(req); 2387 const struct inet_request_sock *ireq = inet_rsk(req);
2620 long delta = req->expires - jiffies; 2388 int ttd = req->expires - jiffies;
2621 2389
2622 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2390 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2623 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n", 2391 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
@@ -2629,9 +2397,9 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2629 TCP_SYN_RECV, 2397 TCP_SYN_RECV,
2630 0, 0, /* could print option size, but that is af dependent. */ 2398 0, 0, /* could print option size, but that is af dependent. */
2631 1, /* timers active (only the expire timer) */ 2399 1, /* timers active (only the expire timer) */
2632 jiffies_delta_to_clock_t(delta), 2400 jiffies_to_clock_t(ttd),
2633 req->num_timeout, 2401 req->retrans,
2634 from_kuid_munged(seq_user_ns(f), uid), 2402 uid,
2635 0, /* non standard timer */ 2403 0, /* non standard timer */
2636 0, /* open_requests have no inode */ 2404 0, /* open_requests have no inode */
2637 atomic_read(&sk->sk_refcnt), 2405 atomic_read(&sk->sk_refcnt),
@@ -2643,10 +2411,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2643{ 2411{
2644 int timer_active; 2412 int timer_active;
2645 unsigned long timer_expires; 2413 unsigned long timer_expires;
2646 const struct tcp_sock *tp = tcp_sk(sk); 2414 struct tcp_sock *tp = tcp_sk(sk);
2647 const struct inet_connection_sock *icsk = inet_csk(sk); 2415 const struct inet_connection_sock *icsk = inet_csk(sk);
2648 const struct inet_sock *inet = inet_sk(sk); 2416 struct inet_sock *inet = inet_sk(sk);
2649 struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2650 __be32 dest = inet->inet_daddr; 2417 __be32 dest = inet->inet_daddr;
2651 __be32 src = inet->inet_rcv_saddr; 2418 __be32 src = inet->inet_rcv_saddr;
2652 __u16 destp = ntohs(inet->inet_dport); 2419 __u16 destp = ntohs(inet->inet_dport);
@@ -2681,9 +2448,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2681 tp->write_seq - tp->snd_una, 2448 tp->write_seq - tp->snd_una,
2682 rx_queue, 2449 rx_queue,
2683 timer_active, 2450 timer_active,
2684 jiffies_delta_to_clock_t(timer_expires - jiffies), 2451 jiffies_to_clock_t(timer_expires - jiffies),
2685 icsk->icsk_retransmits, 2452 icsk->icsk_retransmits,
2686 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), 2453 sock_i_uid(sk),
2687 icsk->icsk_probes_out, 2454 icsk->icsk_probes_out,
2688 sock_i_ino(sk), 2455 sock_i_ino(sk),
2689 atomic_read(&sk->sk_refcnt), sk, 2456 atomic_read(&sk->sk_refcnt), sk,
@@ -2691,18 +2458,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2691 jiffies_to_clock_t(icsk->icsk_ack.ato), 2458 jiffies_to_clock_t(icsk->icsk_ack.ato),
2692 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, 2459 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2693 tp->snd_cwnd, 2460 tp->snd_cwnd,
2694 sk->sk_state == TCP_LISTEN ? 2461 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2695 (fastopenq ? fastopenq->max_qlen : 0) :
2696 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2697 len); 2462 len);
2698} 2463}
2699 2464
2700static void get_timewait4_sock(const struct inet_timewait_sock *tw, 2465static void get_timewait4_sock(struct inet_timewait_sock *tw,
2701 struct seq_file *f, int i, int *len) 2466 struct seq_file *f, int i, int *len)
2702{ 2467{
2703 __be32 dest, src; 2468 __be32 dest, src;
2704 __u16 destp, srcp; 2469 __u16 destp, srcp;
2705 long delta = tw->tw_ttd - jiffies; 2470 int ttd = tw->tw_ttd - jiffies;
2471
2472 if (ttd < 0)
2473 ttd = 0;
2706 2474
2707 dest = tw->tw_daddr; 2475 dest = tw->tw_daddr;
2708 src = tw->tw_rcv_saddr; 2476 src = tw->tw_rcv_saddr;
@@ -2712,7 +2480,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2712 seq_printf(f, "%4d: %08X:%04X %08X:%04X" 2480 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2713 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n", 2481 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2714 i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 2482 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2715 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, 2483 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2716 atomic_read(&tw->tw_refcnt), tw, len); 2484 atomic_read(&tw->tw_refcnt), tw, len);
2717} 2485}
2718 2486
@@ -2749,18 +2517,12 @@ out:
2749 return 0; 2517 return 0;
2750} 2518}
2751 2519
2752static const struct file_operations tcp_afinfo_seq_fops = {
2753 .owner = THIS_MODULE,
2754 .open = tcp_seq_open,
2755 .read = seq_read,
2756 .llseek = seq_lseek,
2757 .release = seq_release_net
2758};
2759
2760static struct tcp_seq_afinfo tcp4_seq_afinfo = { 2520static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2761 .name = "tcp", 2521 .name = "tcp",
2762 .family = AF_INET, 2522 .family = AF_INET,
2763 .seq_fops = &tcp_afinfo_seq_fops, 2523 .seq_fops = {
2524 .owner = THIS_MODULE,
2525 },
2764 .seq_ops = { 2526 .seq_ops = {
2765 .show = tcp4_seq_show, 2527 .show = tcp4_seq_show,
2766 }, 2528 },
@@ -2795,8 +2557,6 @@ void tcp4_proc_exit(void)
2795struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2557struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2796{ 2558{
2797 const struct iphdr *iph = skb_gro_network_header(skb); 2559 const struct iphdr *iph = skb_gro_network_header(skb);
2798 __wsum wsum;
2799 __sum16 sum;
2800 2560
2801 switch (skb->ip_summed) { 2561 switch (skb->ip_summed) {
2802 case CHECKSUM_COMPLETE: 2562 case CHECKSUM_COMPLETE:
@@ -2805,22 +2565,11 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2805 skb->ip_summed = CHECKSUM_UNNECESSARY; 2565 skb->ip_summed = CHECKSUM_UNNECESSARY;
2806 break; 2566 break;
2807 } 2567 }
2808flush:
2809 NAPI_GRO_CB(skb)->flush = 1;
2810 return NULL;
2811 2568
2569 /* fall through */
2812 case CHECKSUM_NONE: 2570 case CHECKSUM_NONE:
2813 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr, 2571 NAPI_GRO_CB(skb)->flush = 1;
2814 skb_gro_len(skb), IPPROTO_TCP, 0); 2572 return NULL;
2815 sum = csum_fold(skb_checksum(skb,
2816 skb_gro_offset(skb),
2817 skb_gro_len(skb),
2818 wsum));
2819 if (sum)
2820 goto flush;
2821
2822 skb->ip_summed = CHECKSUM_UNNECESSARY;
2823 break;
2824 } 2573 }
2825 2574
2826 return tcp_gro_receive(head, skb); 2575 return tcp_gro_receive(head, skb);
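
The removed CHECKSUM_NONE branch above verified the checksum in software: it built the pseudo-header sum with csum_tcpudp_nofold(), added the payload via skb_checksum(), folded the 32-bit total to 16 bits and flushed GRO if the fold was non-zero. The following is a standalone sketch of that final fold only, not the kernel's csum_fold() itself.

/* 16-bit one's-complement fold of a 32-bit running checksum. */
#include <stdint.h>
#include <stdio.h>

static uint16_t csum_fold32(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);   /* fold the carries once... */
	sum = (sum & 0xffff) + (sum >> 16);   /* ...and once more for the last carry */
	return (uint16_t)~sum;                /* one's-complement result */
}

int main(void)
{
	/* A sum that folds to 0 means "checksum verified" in that path. */
	printf("0x%04x\n", csum_fold32(0x0001fffe)); /* -> 0x0000 */
	printf("0x%04x\n", csum_fold32(0x12345678)); /* nonzero -> would flush GRO */
	return 0;
}
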
@@ -2855,8 +2604,6 @@ struct proto tcp_prot = {
2855 .sendmsg = tcp_sendmsg, 2604 .sendmsg = tcp_sendmsg,
2856 .sendpage = tcp_sendpage, 2605 .sendpage = tcp_sendpage,
2857 .backlog_rcv = tcp_v4_do_rcv, 2606 .backlog_rcv = tcp_v4_do_rcv,
2858 .release_cb = tcp_release_cb,
2859 .mtu_reduced = tcp_v4_mtu_reduced,
2860 .hash = inet_hash, 2607 .hash = inet_hash,
2861 .unhash = inet_unhash, 2608 .unhash = inet_unhash,
2862 .get_port = inet_csk_get_port, 2609 .get_port = inet_csk_get_port,
@@ -2865,6 +2612,7 @@ struct proto tcp_prot = {
2865 .orphan_count = &tcp_orphan_count, 2612 .orphan_count = &tcp_orphan_count,
2866 .memory_allocated = &tcp_memory_allocated, 2613 .memory_allocated = &tcp_memory_allocated,
2867 .memory_pressure = &tcp_memory_pressure, 2614 .memory_pressure = &tcp_memory_pressure,
2615 .sysctl_mem = sysctl_tcp_mem,
2868 .sysctl_wmem = sysctl_tcp_wmem, 2616 .sysctl_wmem = sysctl_tcp_wmem,
2869 .sysctl_rmem = sysctl_tcp_rmem, 2617 .sysctl_rmem = sysctl_tcp_rmem,
2870 .max_header = MAX_TCP_HEADER, 2618 .max_header = MAX_TCP_HEADER,
@@ -2878,21 +2626,19 @@ struct proto tcp_prot = {
2878 .compat_setsockopt = compat_tcp_setsockopt, 2626 .compat_setsockopt = compat_tcp_setsockopt,
2879 .compat_getsockopt = compat_tcp_getsockopt, 2627 .compat_getsockopt = compat_tcp_getsockopt,
2880#endif 2628#endif
2881#ifdef CONFIG_MEMCG_KMEM
2882 .init_cgroup = tcp_init_cgroup,
2883 .destroy_cgroup = tcp_destroy_cgroup,
2884 .proto_cgroup = tcp_proto_cgroup,
2885#endif
2886}; 2629};
2887EXPORT_SYMBOL(tcp_prot); 2630EXPORT_SYMBOL(tcp_prot);
2888 2631
2632
2889static int __net_init tcp_sk_init(struct net *net) 2633static int __net_init tcp_sk_init(struct net *net)
2890{ 2634{
2891 return 0; 2635 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2636 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2892} 2637}
2893 2638
2894static void __net_exit tcp_sk_exit(struct net *net) 2639static void __net_exit tcp_sk_exit(struct net *net)
2895{ 2640{
2641 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2896} 2642}
2897 2643
2898static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) 2644static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
deleted file mode 100644
index b6f3583ddfe..00000000000
--- a/net/ipv4/tcp_memcontrol.c
+++ /dev/null
@@ -1,291 +0,0 @@
1#include <net/tcp.h>
2#include <net/tcp_memcontrol.h>
3#include <net/sock.h>
4#include <net/ip.h>
5#include <linux/nsproxy.h>
6#include <linux/memcontrol.h>
7#include <linux/module.h>
8
9static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
10{
11 return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
12}
13
14static void memcg_tcp_enter_memory_pressure(struct sock *sk)
15{
16 if (sk->sk_cgrp->memory_pressure)
17 *sk->sk_cgrp->memory_pressure = 1;
18}
19EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
20
21int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
22{
23 /*
24 * The root cgroup does not use res_counters, but rather,
25 * rely on the data already collected by the network
26 * subsystem
27 */
28 struct res_counter *res_parent = NULL;
29 struct cg_proto *cg_proto, *parent_cg;
30 struct tcp_memcontrol *tcp;
31 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
32 struct net *net = current->nsproxy->net_ns;
33
34 cg_proto = tcp_prot.proto_cgroup(memcg);
35 if (!cg_proto)
36 return 0;
37
38 tcp = tcp_from_cgproto(cg_proto);
39
40 tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
41 tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
42 tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
43 tcp->tcp_memory_pressure = 0;
44
45 parent_cg = tcp_prot.proto_cgroup(parent);
46 if (parent_cg)
47 res_parent = parent_cg->memory_allocated;
48
49 res_counter_init(&tcp->tcp_memory_allocated, res_parent);
50 percpu_counter_init(&tcp->tcp_sockets_allocated, 0);
51
52 cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
53 cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
54 cg_proto->sysctl_mem = tcp->tcp_prot_mem;
55 cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
56 cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
57 cg_proto->memcg = memcg;
58
59 return 0;
60}
61EXPORT_SYMBOL(tcp_init_cgroup);
62
63void tcp_destroy_cgroup(struct mem_cgroup *memcg)
64{
65 struct cg_proto *cg_proto;
66 struct tcp_memcontrol *tcp;
67 u64 val;
68
69 cg_proto = tcp_prot.proto_cgroup(memcg);
70 if (!cg_proto)
71 return;
72
73 tcp = tcp_from_cgproto(cg_proto);
74 percpu_counter_destroy(&tcp->tcp_sockets_allocated);
75
76 val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
77}
78EXPORT_SYMBOL(tcp_destroy_cgroup);
79
80static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
81{
82 struct net *net = current->nsproxy->net_ns;
83 struct tcp_memcontrol *tcp;
84 struct cg_proto *cg_proto;
85 u64 old_lim;
86 int i;
87 int ret;
88
89 cg_proto = tcp_prot.proto_cgroup(memcg);
90 if (!cg_proto)
91 return -EINVAL;
92
93 if (val > RESOURCE_MAX)
94 val = RESOURCE_MAX;
95
96 tcp = tcp_from_cgproto(cg_proto);
97
98 old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
99 ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val);
100 if (ret)
101 return ret;
102
103 for (i = 0; i < 3; i++)
104 tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
105 net->ipv4.sysctl_tcp_mem[i]);
106
107 if (val == RESOURCE_MAX)
108 clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
109 else if (val != RESOURCE_MAX) {
110 /*
111 * The active bit needs to be written after the static_key
112 * update. This is what guarantees that the socket activation
113 * function is the last one to run. See sock_update_memcg() for
114 * details, and note that we don't mark any socket as belonging
115 * to this memcg until that flag is up.
116 *
117 * We need to do this, because static_keys will span multiple
118 * sites, but we can't control their order. If we mark a socket
119 * as accounted, but the accounting functions are not patched in
120 * yet, we'll lose accounting.
121 *
122 * We never race with the readers in sock_update_memcg(),
123 * because when this value change, the code to process it is not
124 * patched in yet.
125 *
126 * The activated bit is used to guarantee that no two writers
127 * will do the update in the same memcg. Without that, we can't
128 * properly shutdown the static key.
129 */
130 if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
131 static_key_slow_inc(&memcg_socket_limit_enabled);
132 set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
133 }
134
135 return 0;
136}
137
138static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft,
139 const char *buffer)
140{
141 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
142 unsigned long long val;
143 int ret = 0;
144
145 switch (cft->private) {
146 case RES_LIMIT:
147 /* see memcontrol.c */
148 ret = res_counter_memparse_write_strategy(buffer, &val);
149 if (ret)
150 break;
151 ret = tcp_update_limit(memcg, val);
152 break;
153 default:
154 ret = -EINVAL;
155 break;
156 }
157 return ret;
158}
159
160static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
161{
162 struct tcp_memcontrol *tcp;
163 struct cg_proto *cg_proto;
164
165 cg_proto = tcp_prot.proto_cgroup(memcg);
166 if (!cg_proto)
167 return default_val;
168
169 tcp = tcp_from_cgproto(cg_proto);
170 return res_counter_read_u64(&tcp->tcp_memory_allocated, type);
171}
172
173static u64 tcp_read_usage(struct mem_cgroup *memcg)
174{
175 struct tcp_memcontrol *tcp;
176 struct cg_proto *cg_proto;
177
178 cg_proto = tcp_prot.proto_cgroup(memcg);
179 if (!cg_proto)
180 return atomic_long_read(&tcp_memory_allocated) << PAGE_SHIFT;
181
182 tcp = tcp_from_cgproto(cg_proto);
183 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
184}
185
186static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
187{
188 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
189 u64 val;
190
191 switch (cft->private) {
192 case RES_LIMIT:
193 val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX);
194 break;
195 case RES_USAGE:
196 val = tcp_read_usage(memcg);
197 break;
198 case RES_FAILCNT:
199 case RES_MAX_USAGE:
200 val = tcp_read_stat(memcg, cft->private, 0);
201 break;
202 default:
203 BUG();
204 }
205 return val;
206}
207
208static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event)
209{
210 struct mem_cgroup *memcg;
211 struct tcp_memcontrol *tcp;
212 struct cg_proto *cg_proto;
213
214 memcg = mem_cgroup_from_cont(cont);
215 cg_proto = tcp_prot.proto_cgroup(memcg);
216 if (!cg_proto)
217 return 0;
218 tcp = tcp_from_cgproto(cg_proto);
219
220 switch (event) {
221 case RES_MAX_USAGE:
222 res_counter_reset_max(&tcp->tcp_memory_allocated);
223 break;
224 case RES_FAILCNT:
225 res_counter_reset_failcnt(&tcp->tcp_memory_allocated);
226 break;
227 }
228
229 return 0;
230}
231
232unsigned long long tcp_max_memory(const struct mem_cgroup *memcg)
233{
234 struct tcp_memcontrol *tcp;
235 struct cg_proto *cg_proto;
236
237 cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg);
238 if (!cg_proto)
239 return 0;
240
241 tcp = tcp_from_cgproto(cg_proto);
242 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
243}
244
245void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
246{
247 struct tcp_memcontrol *tcp;
248 struct cg_proto *cg_proto;
249
250 cg_proto = tcp_prot.proto_cgroup(memcg);
251 if (!cg_proto)
252 return;
253
254 tcp = tcp_from_cgproto(cg_proto);
255
256 tcp->tcp_prot_mem[idx] = val;
257}
258
259static struct cftype tcp_files[] = {
260 {
261 .name = "kmem.tcp.limit_in_bytes",
262 .write_string = tcp_cgroup_write,
263 .read_u64 = tcp_cgroup_read,
264 .private = RES_LIMIT,
265 },
266 {
267 .name = "kmem.tcp.usage_in_bytes",
268 .read_u64 = tcp_cgroup_read,
269 .private = RES_USAGE,
270 },
271 {
272 .name = "kmem.tcp.failcnt",
273 .private = RES_FAILCNT,
274 .trigger = tcp_cgroup_reset,
275 .read_u64 = tcp_cgroup_read,
276 },
277 {
278 .name = "kmem.tcp.max_usage_in_bytes",
279 .private = RES_MAX_USAGE,
280 .trigger = tcp_cgroup_reset,
281 .read_u64 = tcp_cgroup_read,
282 },
283 { } /* terminate */
284};
285
286static int __init tcp_memcontrol_init(void)
287{
288 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files));
289 return 0;
290}
291__initcall(tcp_memcontrol_init);
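
In the deleted tcp_memcontrol.c above, tcp_update_limit() turns the cgroup's byte limit into the three per-cgroup pressure thresholds: pages = limit >> PAGE_SHIFT, each clamped by the corresponding global sysctl_tcp_mem[] entry. A minimal sketch of that arithmetic, with PAGE_SHIFT assumed to be 12 and example global values:

/* Sketch of the byte-limit to page-threshold conversion used above. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	long sysctl_tcp_mem[3] = { 180000, 240000, 360000 }; /* example globals, pages */
	unsigned long long limit_bytes = 256ULL << 20;       /* cgroup limit: 256 MiB */
	long prot_mem[3];

	for (int i = 0; i < 3; i++) {
		long pages = (long)(limit_bytes >> PAGE_SHIFT);

		prot_mem[i] = pages < sysctl_tcp_mem[i] ? pages : sysctl_tcp_mem[i];
		printf("prot_mem[%d] = %ld pages\n", i, prot_mem[i]);
	}
	return 0;
}
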
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
deleted file mode 100644
index f696d7c2e9f..00000000000
--- a/net/ipv4/tcp_metrics.c
+++ /dev/null
@@ -1,1091 +0,0 @@
1#include <linux/rcupdate.h>
2#include <linux/spinlock.h>
3#include <linux/jiffies.h>
4#include <linux/module.h>
5#include <linux/cache.h>
6#include <linux/slab.h>
7#include <linux/init.h>
8#include <linux/tcp.h>
9#include <linux/hash.h>
10#include <linux/tcp_metrics.h>
11#include <linux/vmalloc.h>
12
13#include <net/inet_connection_sock.h>
14#include <net/net_namespace.h>
15#include <net/request_sock.h>
16#include <net/inetpeer.h>
17#include <net/sock.h>
18#include <net/ipv6.h>
19#include <net/dst.h>
20#include <net/tcp.h>
21#include <net/genetlink.h>
22
23int sysctl_tcp_nometrics_save __read_mostly;
24
25struct tcp_fastopen_metrics {
26 u16 mss;
27 u16 syn_loss:10; /* Recurring Fast Open SYN losses */
28 unsigned long last_syn_loss; /* Last Fast Open SYN loss */
29 struct tcp_fastopen_cookie cookie;
30};
31
32struct tcp_metrics_block {
33 struct tcp_metrics_block __rcu *tcpm_next;
34 struct inetpeer_addr tcpm_addr;
35 unsigned long tcpm_stamp;
36 u32 tcpm_ts;
37 u32 tcpm_ts_stamp;
38 u32 tcpm_lock;
39 u32 tcpm_vals[TCP_METRIC_MAX + 1];
40 struct tcp_fastopen_metrics tcpm_fastopen;
41
42 struct rcu_head rcu_head;
43};
44
45static bool tcp_metric_locked(struct tcp_metrics_block *tm,
46 enum tcp_metric_index idx)
47{
48 return tm->tcpm_lock & (1 << idx);
49}
50
51static u32 tcp_metric_get(struct tcp_metrics_block *tm,
52 enum tcp_metric_index idx)
53{
54 return tm->tcpm_vals[idx];
55}
56
57static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
58 enum tcp_metric_index idx)
59{
60 return msecs_to_jiffies(tm->tcpm_vals[idx]);
61}
62
63static void tcp_metric_set(struct tcp_metrics_block *tm,
64 enum tcp_metric_index idx,
65 u32 val)
66{
67 tm->tcpm_vals[idx] = val;
68}
69
70static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
71 enum tcp_metric_index idx,
72 u32 val)
73{
74 tm->tcpm_vals[idx] = jiffies_to_msecs(val);
75}
76
77static bool addr_same(const struct inetpeer_addr *a,
78 const struct inetpeer_addr *b)
79{
80 const struct in6_addr *a6, *b6;
81
82 if (a->family != b->family)
83 return false;
84 if (a->family == AF_INET)
85 return a->addr.a4 == b->addr.a4;
86
87 a6 = (const struct in6_addr *) &a->addr.a6[0];
88 b6 = (const struct in6_addr *) &b->addr.a6[0];
89
90 return ipv6_addr_equal(a6, b6);
91}
92
93struct tcpm_hash_bucket {
94 struct tcp_metrics_block __rcu *chain;
95};
96
97static DEFINE_SPINLOCK(tcp_metrics_lock);
98
99static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
100{
101 u32 val;
102
103 tm->tcpm_stamp = jiffies;
104
105 val = 0;
106 if (dst_metric_locked(dst, RTAX_RTT))
107 val |= 1 << TCP_METRIC_RTT;
108 if (dst_metric_locked(dst, RTAX_RTTVAR))
109 val |= 1 << TCP_METRIC_RTTVAR;
110 if (dst_metric_locked(dst, RTAX_SSTHRESH))
111 val |= 1 << TCP_METRIC_SSTHRESH;
112 if (dst_metric_locked(dst, RTAX_CWND))
113 val |= 1 << TCP_METRIC_CWND;
114 if (dst_metric_locked(dst, RTAX_REORDERING))
115 val |= 1 << TCP_METRIC_REORDERING;
116 tm->tcpm_lock = val;
117
118 tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
119 tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
120 tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
121 tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
122 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
123 tm->tcpm_ts = 0;
124 tm->tcpm_ts_stamp = 0;
125 tm->tcpm_fastopen.mss = 0;
126 tm->tcpm_fastopen.syn_loss = 0;
127 tm->tcpm_fastopen.cookie.len = 0;
128}
129
130static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
131 struct inetpeer_addr *addr,
132 unsigned int hash,
133 bool reclaim)
134{
135 struct tcp_metrics_block *tm;
136 struct net *net;
137
138 spin_lock_bh(&tcp_metrics_lock);
139 net = dev_net(dst->dev);
140 if (unlikely(reclaim)) {
141 struct tcp_metrics_block *oldest;
142
143 oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
144 for (tm = rcu_dereference(oldest->tcpm_next); tm;
145 tm = rcu_dereference(tm->tcpm_next)) {
146 if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
147 oldest = tm;
148 }
149 tm = oldest;
150 } else {
151 tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
152 if (!tm)
153 goto out_unlock;
154 }
155 tm->tcpm_addr = *addr;
156
157 tcpm_suck_dst(tm, dst);
158
159 if (likely(!reclaim)) {
160 tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
161 rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
162 }
163
164out_unlock:
165 spin_unlock_bh(&tcp_metrics_lock);
166 return tm;
167}
168
169#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
170
171static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
172{
173 if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
174 tcpm_suck_dst(tm, dst);
175}
176
177#define TCP_METRICS_RECLAIM_DEPTH 5
178#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL
179
180static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
181{
182 if (tm)
183 return tm;
184 if (depth > TCP_METRICS_RECLAIM_DEPTH)
185 return TCP_METRICS_RECLAIM_PTR;
186 return NULL;
187}
188
189static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr,
190 struct net *net, unsigned int hash)
191{
192 struct tcp_metrics_block *tm;
193 int depth = 0;
194
195 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
196 tm = rcu_dereference(tm->tcpm_next)) {
197 if (addr_same(&tm->tcpm_addr, addr))
198 break;
199 depth++;
200 }
201 return tcp_get_encode(tm, depth);
202}
203
204static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
205 struct dst_entry *dst)
206{
207 struct tcp_metrics_block *tm;
208 struct inetpeer_addr addr;
209 unsigned int hash;
210 struct net *net;
211
212 addr.family = req->rsk_ops->family;
213 switch (addr.family) {
214 case AF_INET:
215 addr.addr.a4 = inet_rsk(req)->rmt_addr;
216 hash = (__force unsigned int) addr.addr.a4;
217 break;
218 case AF_INET6:
219 *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr;
220 hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr);
221 break;
222 default:
223 return NULL;
224 }
225
226 net = dev_net(dst->dev);
227 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
228
229 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
230 tm = rcu_dereference(tm->tcpm_next)) {
231 if (addr_same(&tm->tcpm_addr, &addr))
232 break;
233 }
234 tcpm_check_stamp(tm, dst);
235 return tm;
236}
237
238static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
239{
240 struct inet6_timewait_sock *tw6;
241 struct tcp_metrics_block *tm;
242 struct inetpeer_addr addr;
243 unsigned int hash;
244 struct net *net;
245
246 addr.family = tw->tw_family;
247 switch (addr.family) {
248 case AF_INET:
249 addr.addr.a4 = tw->tw_daddr;
250 hash = (__force unsigned int) addr.addr.a4;
251 break;
252 case AF_INET6:
253 tw6 = inet6_twsk((struct sock *)tw);
254 *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr;
255 hash = ipv6_addr_hash(&tw6->tw_v6_daddr);
256 break;
257 default:
258 return NULL;
259 }
260
261 net = twsk_net(tw);
262 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
263
264 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
265 tm = rcu_dereference(tm->tcpm_next)) {
266 if (addr_same(&tm->tcpm_addr, &addr))
267 break;
268 }
269 return tm;
270}
271
272static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
273 struct dst_entry *dst,
274 bool create)
275{
276 struct tcp_metrics_block *tm;
277 struct inetpeer_addr addr;
278 unsigned int hash;
279 struct net *net;
280 bool reclaim;
281
282 addr.family = sk->sk_family;
283 switch (addr.family) {
284 case AF_INET:
285 addr.addr.a4 = inet_sk(sk)->inet_daddr;
286 hash = (__force unsigned int) addr.addr.a4;
287 break;
288 case AF_INET6:
289 *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr;
290 hash = ipv6_addr_hash(&inet6_sk(sk)->daddr);
291 break;
292 default:
293 return NULL;
294 }
295
296 net = dev_net(dst->dev);
297 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
298
299 tm = __tcp_get_metrics(&addr, net, hash);
300 reclaim = false;
301 if (tm == TCP_METRICS_RECLAIM_PTR) {
302 reclaim = true;
303 tm = NULL;
304 }
305 if (!tm && create)
306 tm = tcpm_new(dst, &addr, hash, reclaim);
307 else
308 tcpm_check_stamp(tm, dst);
309
310 return tm;
311}
312
313/* Save metrics learned by this TCP session. This function is called
314 * only, when TCP finishes successfully i.e. when it enters TIME-WAIT
315 * or goes from LAST-ACK to CLOSE.
316 */
317void tcp_update_metrics(struct sock *sk)
318{
319 const struct inet_connection_sock *icsk = inet_csk(sk);
320 struct dst_entry *dst = __sk_dst_get(sk);
321 struct tcp_sock *tp = tcp_sk(sk);
322 struct tcp_metrics_block *tm;
323 unsigned long rtt;
324 u32 val;
325 int m;
326
327 if (sysctl_tcp_nometrics_save || !dst)
328 return;
329
330 if (dst->flags & DST_HOST)
331 dst_confirm(dst);
332
333 rcu_read_lock();
334 if (icsk->icsk_backoff || !tp->srtt) {
335 /* This session failed to estimate rtt. Why?
336 * Probably, no packets returned in time. Reset our
337 * results.
338 */
339 tm = tcp_get_metrics(sk, dst, false);
340 if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
341 tcp_metric_set(tm, TCP_METRIC_RTT, 0);
342 goto out_unlock;
343 } else
344 tm = tcp_get_metrics(sk, dst, true);
345
346 if (!tm)
347 goto out_unlock;
348
349 rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
350 m = rtt - tp->srtt;
351
352 /* If newly calculated rtt larger than stored one, store new
353 * one. Otherwise, use EWMA. Remember, rtt overestimation is
354 * always better than underestimation.
355 */
356 if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
357 if (m <= 0)
358 rtt = tp->srtt;
359 else
360 rtt -= (m >> 3);
361 tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
362 }
363
364 if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
365 unsigned long var;
366
367 if (m < 0)
368 m = -m;
369
370 /* Scale deviation to rttvar fixed point */
371 m >>= 1;
372 if (m < tp->mdev)
373 m = tp->mdev;
374
375 var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
376 if (m >= var)
377 var = m;
378 else
379 var -= (var - m) >> 2;
380
381 tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
382 }
383
384 if (tcp_in_initial_slowstart(tp)) {
385 /* Slow start still did not finish. */
386 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
387 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
388 if (val && (tp->snd_cwnd >> 1) > val)
389 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
390 tp->snd_cwnd >> 1);
391 }
392 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
393 val = tcp_metric_get(tm, TCP_METRIC_CWND);
394 if (tp->snd_cwnd > val)
395 tcp_metric_set(tm, TCP_METRIC_CWND,
396 tp->snd_cwnd);
397 }
398 } else if (tp->snd_cwnd > tp->snd_ssthresh &&
399 icsk->icsk_ca_state == TCP_CA_Open) {
400 /* Cong. avoidance phase, cwnd is reliable. */
401 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
402 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
403 max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
404 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
405 val = tcp_metric_get(tm, TCP_METRIC_CWND);
406 tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
407 }
408 } else {
409 /* Else slow start did not finish, cwnd is non-sense,
410 * ssthresh may be also invalid.
411 */
412 if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
413 val = tcp_metric_get(tm, TCP_METRIC_CWND);
414 tcp_metric_set(tm, TCP_METRIC_CWND,
415 (val + tp->snd_ssthresh) >> 1);
416 }
417 if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
418 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
419 if (val && tp->snd_ssthresh > val)
420 tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
421 tp->snd_ssthresh);
422 }
423 if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
424 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
425 if (val < tp->reordering &&
426 tp->reordering != sysctl_tcp_reordering)
427 tcp_metric_set(tm, TCP_METRIC_REORDERING,
428 tp->reordering);
429 }
430 }
431 tm->tcpm_stamp = jiffies;
432out_unlock:
433 rcu_read_unlock();
434}
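
tcp_update_metrics() above smooths the cached RTT asymmetrically: the stored value only decays toward a smaller sample by 1/8 of the difference, but jumps straight up to a larger one, since overestimating RTT is safer than underestimating it. A standalone sketch of just that update (values treated as milliseconds):

/* EWMA-style cached-RTT update mirroring the logic above. */
#include <stdio.h>

static long update_cached_rtt(long cached_rtt, long srtt)
{
	long m = cached_rtt - srtt;

	if (m <= 0)
		return srtt;                  /* new sample is larger: take it as-is */
	return cached_rtt - (m >> 3);         /* otherwise decay toward it by 1/8 */
}

int main(void)
{
	long cached = 200;

	cached = update_cached_rtt(cached, 120); /* 200 - (80 >> 3) = 190 */
	printf("%ld\n", cached);
	cached = update_cached_rtt(cached, 400); /* larger sample: jump to 400 */
	printf("%ld\n", cached);
	return 0;
}
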
435
436/* Initialize metrics on socket. */
437
438void tcp_init_metrics(struct sock *sk)
439{
440 struct dst_entry *dst = __sk_dst_get(sk);
441 struct tcp_sock *tp = tcp_sk(sk);
442 struct tcp_metrics_block *tm;
443 u32 val;
444
445 if (dst == NULL)
446 goto reset;
447
448 dst_confirm(dst);
449
450 rcu_read_lock();
451 tm = tcp_get_metrics(sk, dst, true);
452 if (!tm) {
453 rcu_read_unlock();
454 goto reset;
455 }
456
457 if (tcp_metric_locked(tm, TCP_METRIC_CWND))
458 tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
459
460 val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
461 if (val) {
462 tp->snd_ssthresh = val;
463 if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
464 tp->snd_ssthresh = tp->snd_cwnd_clamp;
465 } else {
466		/* ssthresh may have been reduced unnecessarily during
467 * 3WHS. Restore it back to its initial default.
468 */
469 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
470 }
471 val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
472 if (val && tp->reordering != val) {
473 tcp_disable_fack(tp);
474 tcp_disable_early_retrans(tp);
475 tp->reordering = val;
476 }
477
478 val = tcp_metric_get(tm, TCP_METRIC_RTT);
479 if (val == 0 || tp->srtt == 0) {
480 rcu_read_unlock();
481 goto reset;
482 }
483 /* Initial rtt is determined from SYN,SYN-ACK.
484 * The segment is small and rtt may appear much
485 * less than real one. Use per-dst memory
486 * to make it more realistic.
487 *
488 * A bit of theory. RTT is time passed after "normal" sized packet
489 * is sent until it is ACKed. In normal circumstances sending small
490 * packets force peer to delay ACKs and calculation is correct too.
491 * The algorithm is adaptive and, provided we follow specs, it
492 * NEVER underestimate RTT. BUT! If peer tries to make some clever
493 * tricks sort of "quick acks" for time long enough to decrease RTT
494 * to low value, and then abruptly stops to do it and starts to delay
495 * ACKs, wait for troubles.
496 */
497 val = msecs_to_jiffies(val);
498 if (val > tp->srtt) {
499 tp->srtt = val;
500 tp->rtt_seq = tp->snd_nxt;
501 }
502 val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
503 if (val > tp->mdev) {
504 tp->mdev = val;
505 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
506 }
507 rcu_read_unlock();
508
509 tcp_set_rto(sk);
510reset:
511 if (tp->srtt == 0) {
512 /* RFC6298: 5.7 We've failed to get a valid RTT sample from
513 * 3WHS. This is most likely due to retransmission,
514 * including spurious one. Reset the RTO back to 3secs
515 * from the more aggressive 1sec to avoid more spurious
516 * retransmission.
517 */
518 tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
519 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
520 }
521 /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
522 * retransmitted. In light of RFC6298 more aggressive 1sec
523 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
524 * retransmission has occurred.
525 */
526 if (tp->total_retrans > 1)
527 tp->snd_cwnd = 1;
528 else
529 tp->snd_cwnd = tcp_init_cwnd(tp, dst);
530 tp->snd_cwnd_stamp = tcp_time_stamp;
531}
532
533bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
534{
535 struct tcp_metrics_block *tm;
536 bool ret;
537
538 if (!dst)
539 return false;
540
541 rcu_read_lock();
542 tm = __tcp_get_metrics_req(req, dst);
543 if (paws_check) {
544 if (tm &&
545 (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
546 (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
547 ret = false;
548 else
549 ret = true;
550 } else {
551 if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
552 ret = true;
553 else
554 ret = false;
555 }
556 rcu_read_unlock();
557
558 return ret;
559}
560EXPORT_SYMBOL_GPL(tcp_peer_is_proven);
561
562void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
563{
564 struct tcp_metrics_block *tm;
565
566 rcu_read_lock();
567 tm = tcp_get_metrics(sk, dst, true);
568 if (tm) {
569 struct tcp_sock *tp = tcp_sk(sk);
570
571 if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
572 tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
573 tp->rx_opt.ts_recent = tm->tcpm_ts;
574 }
575 }
576 rcu_read_unlock();
577}
578EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);
579
580/* VJ's idea. Save last timestamp seen from this destination and hold
581 * it at least for normal timewait interval to use for duplicate
582 * segment detection in subsequent connections, before they enter
583 * synchronized state.
584 */
585bool tcp_remember_stamp(struct sock *sk)
586{
587 struct dst_entry *dst = __sk_dst_get(sk);
588 bool ret = false;
589
590 if (dst) {
591 struct tcp_metrics_block *tm;
592
593 rcu_read_lock();
594 tm = tcp_get_metrics(sk, dst, true);
595 if (tm) {
596 struct tcp_sock *tp = tcp_sk(sk);
597
598 if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
599 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
600 tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
601 tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
602 tm->tcpm_ts = tp->rx_opt.ts_recent;
603 }
604 ret = true;
605 }
606 rcu_read_unlock();
607 }
608 return ret;
609}
610
611bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
612{
613 struct tcp_metrics_block *tm;
614 bool ret = false;
615
616 rcu_read_lock();
617 tm = __tcp_get_metrics_tw(tw);
618 if (tm) {
619 const struct tcp_timewait_sock *tcptw;
620 struct sock *sk = (struct sock *) tw;
621
622 tcptw = tcp_twsk(sk);
623 if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
624 ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
625 tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
626 tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
627 tm->tcpm_ts = tcptw->tw_ts_recent;
628 }
629 ret = true;
630 }
631 rcu_read_unlock();
632
633 return ret;
634}
635
636static DEFINE_SEQLOCK(fastopen_seqlock);
637
638void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
639 struct tcp_fastopen_cookie *cookie,
640 int *syn_loss, unsigned long *last_syn_loss)
641{
642 struct tcp_metrics_block *tm;
643
644 rcu_read_lock();
645 tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
646 if (tm) {
647 struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
648 unsigned int seq;
649
650 do {
651 seq = read_seqbegin(&fastopen_seqlock);
652 if (tfom->mss)
653 *mss = tfom->mss;
654 *cookie = tfom->cookie;
655 *syn_loss = tfom->syn_loss;
656 *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
657 } while (read_seqretry(&fastopen_seqlock, seq));
658 }
659 rcu_read_unlock();
660}
661
662void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
663 struct tcp_fastopen_cookie *cookie, bool syn_lost)
664{
665 struct tcp_metrics_block *tm;
666
667 rcu_read_lock();
668 tm = tcp_get_metrics(sk, __sk_dst_get(sk), true);
669 if (tm) {
670 struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
671
672 write_seqlock_bh(&fastopen_seqlock);
673 tfom->mss = mss;
674 if (cookie->len > 0)
675 tfom->cookie = *cookie;
676 if (syn_lost) {
677 ++tfom->syn_loss;
678 tfom->last_syn_loss = jiffies;
679 } else
680 tfom->syn_loss = 0;
681 write_sequnlock_bh(&fastopen_seqlock);
682 }
683 rcu_read_unlock();
684}
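
Ignoring the seqlock (which only keeps readers consistent), the cache update in tcp_fastopen_cache_set() reduces to a small amount of bookkeeping: record the negotiated MSS, keep the old cookie unless a new one arrives, and count consecutive losses of the data-carrying SYN. A hypothetical userspace rendering of that policy; the struct and function names are invented for the sketch:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct fo_cookie { uint8_t len; uint8_t val[16]; };

struct fo_metrics {
	uint16_t mss;
	uint16_t syn_loss;
	time_t   last_syn_loss;
	struct fo_cookie cookie;
};

/* Mirrors the policy above: always record the MSS, keep the previous
 * cookie unless the caller hands in a new one, and count consecutive
 * SYN-data losses so later connects can back off. */
static void fo_cache_set(struct fo_metrics *m, uint16_t mss,
			 const struct fo_cookie *cookie, int syn_lost)
{
	m->mss = mss;
	if (cookie && cookie->len > 0)
		m->cookie = *cookie;
	if (syn_lost) {
		m->syn_loss++;
		m->last_syn_loss = time(NULL);
	} else {
		m->syn_loss = 0;
	}
}

int main(void)
{
	struct fo_metrics m = { 0 };
	struct fo_cookie c = { .len = 8, .val = { 1, 2, 3, 4, 5, 6, 7, 8 } };

	fo_cache_set(&m, 1460, &c, 0);		/* successful exchange, new cookie */
	fo_cache_set(&m, 1460, NULL, 1);	/* next SYN with data was lost */
	printf("mss=%u cookie_len=%u syn_loss=%u\n",
	       (unsigned int)m.mss, (unsigned int)m.cookie.len,
	       (unsigned int)m.syn_loss);
	return 0;
}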
685
686static struct genl_family tcp_metrics_nl_family = {
687 .id = GENL_ID_GENERATE,
688 .hdrsize = 0,
689 .name = TCP_METRICS_GENL_NAME,
690 .version = TCP_METRICS_GENL_VERSION,
691 .maxattr = TCP_METRICS_ATTR_MAX,
692 .netnsok = true,
693};
694
695static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
696 [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
697 [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
698 .len = sizeof(struct in6_addr), },
699 /* The following attributes are not received for GET/DEL;
700 * we keep them for reference.
701 */
702#if 0
703 [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, },
704 [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, },
705 [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, },
706 [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, },
707 [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, },
708 [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, },
709 [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, },
710 [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
711 .len = TCP_FASTOPEN_COOKIE_MAX, },
712#endif
713};
714
715/* Add attributes; the caller cancels its header on failure. */
716static int tcp_metrics_fill_info(struct sk_buff *msg,
717 struct tcp_metrics_block *tm)
718{
719 struct nlattr *nest;
720 int i;
721
722 switch (tm->tcpm_addr.family) {
723 case AF_INET:
724 if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
725 tm->tcpm_addr.addr.a4) < 0)
726 goto nla_put_failure;
727 break;
728 case AF_INET6:
729 if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
730 tm->tcpm_addr.addr.a6) < 0)
731 goto nla_put_failure;
732 break;
733 default:
734 return -EAFNOSUPPORT;
735 }
736
737 if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
738 jiffies - tm->tcpm_stamp) < 0)
739 goto nla_put_failure;
740 if (tm->tcpm_ts_stamp) {
741 if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
742 (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
743 goto nla_put_failure;
744 if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
745 tm->tcpm_ts) < 0)
746 goto nla_put_failure;
747 }
748
749 {
750 int n = 0;
751
752 nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
753 if (!nest)
754 goto nla_put_failure;
755 for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
756 if (!tm->tcpm_vals[i])
757 continue;
758 if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
759 goto nla_put_failure;
760 n++;
761 }
762 if (n)
763 nla_nest_end(msg, nest);
764 else
765 nla_nest_cancel(msg, nest);
766 }
767
768 {
769 struct tcp_fastopen_metrics tfom_copy[1], *tfom;
770 unsigned int seq;
771
772 do {
773 seq = read_seqbegin(&fastopen_seqlock);
774 tfom_copy[0] = tm->tcpm_fastopen;
775 } while (read_seqretry(&fastopen_seqlock, seq));
776
777 tfom = tfom_copy;
778 if (tfom->mss &&
779 nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
780 tfom->mss) < 0)
781 goto nla_put_failure;
782 if (tfom->syn_loss &&
783 (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
784 tfom->syn_loss) < 0 ||
785 nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
786 jiffies - tfom->last_syn_loss) < 0))
787 goto nla_put_failure;
788 if (tfom->cookie.len > 0 &&
789 nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
790 tfom->cookie.len, tfom->cookie.val) < 0)
791 goto nla_put_failure;
792 }
793
794 return 0;
795
796nla_put_failure:
797 return -EMSGSIZE;
798}
799
800static int tcp_metrics_dump_info(struct sk_buff *skb,
801 struct netlink_callback *cb,
802 struct tcp_metrics_block *tm)
803{
804 void *hdr;
805
806 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
807 &tcp_metrics_nl_family, NLM_F_MULTI,
808 TCP_METRICS_CMD_GET);
809 if (!hdr)
810 return -EMSGSIZE;
811
812 if (tcp_metrics_fill_info(skb, tm) < 0)
813 goto nla_put_failure;
814
815 return genlmsg_end(skb, hdr);
816
817nla_put_failure:
818 genlmsg_cancel(skb, hdr);
819 return -EMSGSIZE;
820}
821
822static int tcp_metrics_nl_dump(struct sk_buff *skb,
823 struct netlink_callback *cb)
824{
825 struct net *net = sock_net(skb->sk);
826 unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
827 unsigned int row, s_row = cb->args[0];
828 int s_col = cb->args[1], col = s_col;
829
830 for (row = s_row; row < max_rows; row++, s_col = 0) {
831 struct tcp_metrics_block *tm;
832 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;
833
834 rcu_read_lock();
835 for (col = 0, tm = rcu_dereference(hb->chain); tm;
836 tm = rcu_dereference(tm->tcpm_next), col++) {
837 if (col < s_col)
838 continue;
839 if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
840 rcu_read_unlock();
841 goto done;
842 }
843 }
844 rcu_read_unlock();
845 }
846
847done:
848 cb->args[0] = row;
849 cb->args[1] = col;
850 return skb->len;
851}
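
The dump resumes across netlink messages by parking a (row, col) cursor in cb->args[]. Below is a toy version of the same resumable walk, where a fixed per-call budget stands in for the skb running out of space; table size, names and output are illustrative:

#include <stdio.h>

#define ROWS 4
#define COLS 3

struct cursor { unsigned int row, col; };

/* Emit at most 'budget' entries starting at *cur; return how many were
 * emitted. On a budget stop the cursor names the entry that did not fit,
 * so the next call re-attempts exactly that entry, as the dump above does. */
static int dump_some(struct cursor *cur, int budget)
{
	unsigned int row, col;
	int emitted = 0;

	for (row = cur->row; row < ROWS; row++, cur->col = 0) {
		for (col = cur->col; col < COLS; col++) {
			if (emitted == budget) {
				cur->row = row;
				cur->col = col;
				return emitted;
			}
			printf("entry %u.%u\n", row, col);
			emitted++;
		}
	}
	cur->row = ROWS;
	cur->col = 0;
	return emitted;
}

int main(void)
{
	struct cursor cur = { 0, 0 };

	while (dump_some(&cur, 5) > 0)
		printf("-- message boundary --\n");
	return 0;
}

Each "message boundary" stands in for one filled skb handed back to userspace; the saved cursor is all the state needed to continue.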
852
853static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
854 unsigned int *hash, int optional)
855{
856 struct nlattr *a;
857
858 a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4];
859 if (a) {
860 addr->family = AF_INET;
861 addr->addr.a4 = nla_get_be32(a);
862 *hash = (__force unsigned int) addr->addr.a4;
863 return 0;
864 }
865 a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6];
866 if (a) {
867 if (nla_len(a) != sizeof(struct in6_addr))
868 return -EINVAL;
869 addr->family = AF_INET6;
870 memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
871 *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
872 return 0;
873 }
874 return optional ? 1 : -EAFNOSUPPORT;
875}
876
877static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
878{
879 struct tcp_metrics_block *tm;
880 struct inetpeer_addr addr;
881 unsigned int hash;
882 struct sk_buff *msg;
883 struct net *net = genl_info_net(info);
884 void *reply;
885 int ret;
886
887 ret = parse_nl_addr(info, &addr, &hash, 0);
888 if (ret < 0)
889 return ret;
890
891 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
892 if (!msg)
893 return -ENOMEM;
894
895 reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
896 info->genlhdr->cmd);
897 if (!reply)
898 goto nla_put_failure;
899
900 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
901 ret = -ESRCH;
902 rcu_read_lock();
903 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
904 tm = rcu_dereference(tm->tcpm_next)) {
905 if (addr_same(&tm->tcpm_addr, &addr)) {
906 ret = tcp_metrics_fill_info(msg, tm);
907 break;
908 }
909 }
910 rcu_read_unlock();
911 if (ret < 0)
912 goto out_free;
913
914 genlmsg_end(msg, reply);
915 return genlmsg_reply(msg, info);
916
917nla_put_failure:
918 ret = -EMSGSIZE;
919
920out_free:
921 nlmsg_free(msg);
922 return ret;
923}
924
925#define deref_locked_genl(p) \
926 rcu_dereference_protected(p, lockdep_genl_is_held() && \
927 lockdep_is_held(&tcp_metrics_lock))
928
929#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held())
930
931static int tcp_metrics_flush_all(struct net *net)
932{
933 unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
934 struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
935 struct tcp_metrics_block *tm;
936 unsigned int row;
937
938 for (row = 0; row < max_rows; row++, hb++) {
939 spin_lock_bh(&tcp_metrics_lock);
940 tm = deref_locked_genl(hb->chain);
941 if (tm)
942 hb->chain = NULL;
943 spin_unlock_bh(&tcp_metrics_lock);
944 while (tm) {
945 struct tcp_metrics_block *next;
946
947 next = deref_genl(tm->tcpm_next);
948 kfree_rcu(tm, rcu_head);
949 tm = next;
950 }
951 }
952 return 0;
953}
954
955static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
956{
957 struct tcpm_hash_bucket *hb;
958 struct tcp_metrics_block *tm;
959 struct tcp_metrics_block __rcu **pp;
960 struct inetpeer_addr addr;
961 unsigned int hash;
962 struct net *net = genl_info_net(info);
963 int ret;
964
965 ret = parse_nl_addr(info, &addr, &hash, 1);
966 if (ret < 0)
967 return ret;
968 if (ret > 0)
969 return tcp_metrics_flush_all(net);
970
971 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
972 hb = net->ipv4.tcp_metrics_hash + hash;
973 pp = &hb->chain;
974 spin_lock_bh(&tcp_metrics_lock);
975 for (tm = deref_locked_genl(*pp); tm;
976 pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) {
977 if (addr_same(&tm->tcpm_addr, &addr)) {
978 *pp = tm->tcpm_next;
979 break;
980 }
981 }
982 spin_unlock_bh(&tcp_metrics_lock);
983 if (!tm)
984 return -ESRCH;
985 kfree_rcu(tm, rcu_head);
986 return 0;
987}
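
The delete path walks the hash chain with a pointer-to-pointer, so unlinking needs no special case for the chain head. The same idiom in a self-contained form, with invented types and values:

#include <stdio.h>

struct node { int key; struct node *next; };

/* *pp always names the link that would have to be rewritten, whether it
 * is the list head or some node's next pointer. */
static struct node *unlink_key(struct node **head, int key)
{
	struct node **pp = head, *n;

	for (n = *pp; n; pp = &n->next, n = *pp) {
		if (n->key == key) {
			*pp = n->next;	/* splice the node out */
			return n;
		}
	}
	return NULL;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a, *hit = unlink_key(&head, 2);

	printf("removed=%d head=%d->%d\n",
	       hit ? hit->key : -1, head->key, head->next->key);
	return 0;
}

Compared with carrying a separate "prev" pointer, the extra level of indirection keeps the loop body down to a single splice.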
988
989static struct genl_ops tcp_metrics_nl_ops[] = {
990 {
991 .cmd = TCP_METRICS_CMD_GET,
992 .doit = tcp_metrics_nl_cmd_get,
993 .dumpit = tcp_metrics_nl_dump,
994 .policy = tcp_metrics_nl_policy,
995 .flags = GENL_ADMIN_PERM,
996 },
997 {
998 .cmd = TCP_METRICS_CMD_DEL,
999 .doit = tcp_metrics_nl_cmd_del,
1000 .policy = tcp_metrics_nl_policy,
1001 .flags = GENL_ADMIN_PERM,
1002 },
1003};
1004
1005static unsigned int tcpmhash_entries;
1006static int __init set_tcpmhash_entries(char *str)
1007{
1008 ssize_t ret;
1009
1010 if (!str)
1011 return 0;
1012
1013 ret = kstrtouint(str, 0, &tcpmhash_entries);
1014 if (ret)
1015 return 0;
1016
1017 return 1;
1018}
1019__setup("tcpmhash_entries=", set_tcpmhash_entries);
1020
1021static int __net_init tcp_net_metrics_init(struct net *net)
1022{
1023 size_t size;
1024 unsigned int slots;
1025
1026 slots = tcpmhash_entries;
1027 if (!slots) {
1028 if (totalram_pages >= 128 * 1024)
1029 slots = 16 * 1024;
1030 else
1031 slots = 8 * 1024;
1032 }
1033
1034 net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
1035 size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;
1036
1037 net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1038 if (!net->ipv4.tcp_metrics_hash)
1039 net->ipv4.tcp_metrics_hash = vzalloc(size);
1040
1041 if (!net->ipv4.tcp_metrics_hash)
1042 return -ENOMEM;
1043
1044 return 0;
1045}
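
A rough model of the sizing arithmetic above, assuming each bucket holds a single chain pointer; order_base_2() is re-implemented here as a plain loop rather than the kernel helper, and the numbers are only examples:

#include <stdio.h>

static unsigned int order_base_2(unsigned int n)	/* smallest l with 2^l >= n */
{
	unsigned int l = 0;

	while ((1u << l) < n)
		l++;
	return l;
}

int main(void)
{
	unsigned int slots = 16 * 1024;			/* the large-memory default above */
	unsigned int hash_log = order_base_2(slots);
	size_t size = sizeof(void *) << hash_log;	/* one chain pointer per bucket */

	printf("slots=%u hash_log=%u table=%zu bytes\n", slots, hash_log, size);
	return 0;
}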
1046
1047static void __net_exit tcp_net_metrics_exit(struct net *net)
1048{
1049 unsigned int i;
1050
1051 for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
1052 struct tcp_metrics_block *tm, *next;
1053
1054 tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
1055 while (tm) {
1056 next = rcu_dereference_protected(tm->tcpm_next, 1);
1057 kfree(tm);
1058 tm = next;
1059 }
1060 }
1061 if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash))
1062 vfree(net->ipv4.tcp_metrics_hash);
1063 else
1064 kfree(net->ipv4.tcp_metrics_hash);
1065}
1066
1067static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
1068 .init = tcp_net_metrics_init,
1069 .exit = tcp_net_metrics_exit,
1070};
1071
1072void __init tcp_metrics_init(void)
1073{
1074 int ret;
1075
1076 ret = register_pernet_subsys(&tcp_net_metrics_ops);
1077 if (ret < 0)
1078 goto cleanup;
1079 ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
1080 tcp_metrics_nl_ops,
1081 ARRAY_SIZE(tcp_metrics_nl_ops));
1082 if (ret < 0)
1083 goto cleanup_subsys;
1084 return;
1085
1086cleanup_subsys:
1087 unregister_pernet_subsys(&tcp_net_metrics_ops);
1088
1089cleanup:
1090 return;
1091}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f35f2dfb640..0ce3d06dce6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -49,12 +49,62 @@ struct inet_timewait_death_row tcp_death_row = {
49}; 49};
50EXPORT_SYMBOL_GPL(tcp_death_row); 50EXPORT_SYMBOL_GPL(tcp_death_row);
51 51
52static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 52/* VJ's idea. Save last timestamp seen from this destination
53 * and hold it at least for normal timewait interval to use for duplicate
54 * segment detection in subsequent connections, before they enter synchronized
55 * state.
56 */
57
58static int tcp_remember_stamp(struct sock *sk)
59{
60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct tcp_sock *tp = tcp_sk(sk);
62 struct inet_peer *peer;
63 bool release_it;
64
65 peer = icsk->icsk_af_ops->get_peer(sk, &release_it);
66 if (peer) {
67 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
68 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
69 peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
70 peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
71 peer->tcp_ts = tp->rx_opt.ts_recent;
72 }
73 if (release_it)
74 inet_putpeer(peer);
75 return 1;
76 }
77
78 return 0;
79}
80
81static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
82{
83 struct sock *sk = (struct sock *) tw;
84 struct inet_peer *peer;
85
86 peer = twsk_getpeer(sk);
87 if (peer) {
88 const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
89
90 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
91 ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
92 peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
93 peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
94 peer->tcp_ts = tcptw->tw_ts_recent;
95 }
96 inet_putpeer(peer);
97 return 1;
98 }
99 return 0;
100}
101
102static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
53{ 103{
54 if (seq == s_win) 104 if (seq == s_win)
55 return true; 105 return 1;
56 if (after(end_seq, s_win) && before(seq, e_win)) 106 if (after(end_seq, s_win) && before(seq, e_win))
57 return true; 107 return 1;
58 return seq == e_win && seq == end_seq; 108 return seq == e_win && seq == end_seq;
59} 109}
60 110
@@ -85,21 +135,19 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
85 * spinlock it. I do not want! Well, probability of misbehaviour 135 * spinlock it. I do not want! Well, probability of misbehaviour
86 * is ridiculously low and, seems, we could use some mb() tricks 136 * is ridiculously low and, seems, we could use some mb() tricks
87 * to avoid misread sequence numbers, states etc. --ANK 137 * to avoid misread sequence numbers, states etc. --ANK
88 *
89 * We don't need to initialize tmp_out.sack_ok as we don't use the results
90 */ 138 */
91enum tcp_tw_status 139enum tcp_tw_status
92tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, 140tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
93 const struct tcphdr *th) 141 const struct tcphdr *th)
94{ 142{
95 struct tcp_options_received tmp_opt; 143 struct tcp_options_received tmp_opt;
96 const u8 *hash_location; 144 u8 *hash_location;
97 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 145 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
98 bool paws_reject = false; 146 int paws_reject = 0;
99 147
100 tmp_opt.saw_tstamp = 0; 148 tmp_opt.saw_tstamp = 0;
101 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { 149 if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
102 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 150 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
103 151
104 if (tmp_opt.saw_tstamp) { 152 if (tmp_opt.saw_tstamp) {
105 tmp_opt.ts_recent = tcptw->tw_ts_recent; 153 tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -268,7 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
268 struct inet_timewait_sock *tw = NULL; 316 struct inet_timewait_sock *tw = NULL;
269 const struct inet_connection_sock *icsk = inet_csk(sk); 317 const struct inet_connection_sock *icsk = inet_csk(sk);
270 const struct tcp_sock *tp = tcp_sk(sk); 318 const struct tcp_sock *tp = tcp_sk(sk);
271 bool recycle_ok = false; 319 int recycle_ok = 0;
272 320
273 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 321 if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
274 recycle_ok = tcp_remember_stamp(sk); 322 recycle_ok = tcp_remember_stamp(sk);
@@ -279,9 +327,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
279 if (tw != NULL) { 327 if (tw != NULL) {
280 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 328 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
281 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 329 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
282 struct inet_sock *inet = inet_sk(sk);
283 330
284 tw->tw_transparent = inet->transparent; 331 tw->tw_transparent = inet_sk(sk)->transparent;
285 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; 332 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
286 tcptw->tw_rcv_nxt = tp->rcv_nxt; 333 tcptw->tw_rcv_nxt = tp->rcv_nxt;
287 tcptw->tw_snd_nxt = tp->snd_nxt; 334 tcptw->tw_snd_nxt = tp->snd_nxt;
@@ -289,16 +336,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
289 tcptw->tw_ts_recent = tp->rx_opt.ts_recent; 336 tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
290 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; 337 tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
291 338
292#if IS_ENABLED(CONFIG_IPV6) 339#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
293 if (tw->tw_family == PF_INET6) { 340 if (tw->tw_family == PF_INET6) {
294 struct ipv6_pinfo *np = inet6_sk(sk); 341 struct ipv6_pinfo *np = inet6_sk(sk);
295 struct inet6_timewait_sock *tw6; 342 struct inet6_timewait_sock *tw6;
296 343
297 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); 344 tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
298 tw6 = inet6_twsk((struct sock *)tw); 345 tw6 = inet6_twsk((struct sock *)tw);
299 tw6->tw_v6_daddr = np->daddr; 346 ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
300 tw6->tw_v6_rcv_saddr = np->rcv_saddr; 347 ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
301 tw->tw_tclass = np->tclass;
302 tw->tw_ipv6only = np->ipv6only; 348 tw->tw_ipv6only = np->ipv6only;
303 } 349 }
304#endif 350#endif
@@ -312,11 +358,13 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
312 */ 358 */
313 do { 359 do {
314 struct tcp_md5sig_key *key; 360 struct tcp_md5sig_key *key;
315 tcptw->tw_md5_key = NULL; 361 memset(tcptw->tw_md5_key, 0, sizeof(tcptw->tw_md5_key));
362 tcptw->tw_md5_keylen = 0;
316 key = tp->af_specific->md5_lookup(sk, sk); 363 key = tp->af_specific->md5_lookup(sk, sk);
317 if (key != NULL) { 364 if (key != NULL) {
318 tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); 365 memcpy(&tcptw->tw_md5_key, key->key, key->keylen);
319 if (tcptw->tw_md5_key && tcp_alloc_md5sig_pool(sk) == NULL) 366 tcptw->tw_md5_keylen = key->keylen;
367 if (tcp_alloc_md5sig_pool(sk) == NULL)
320 BUG(); 368 BUG();
321 } 369 }
322 } while (0); 370 } while (0);
@@ -356,11 +404,8 @@ void tcp_twsk_destructor(struct sock *sk)
356{ 404{
357#ifdef CONFIG_TCP_MD5SIG 405#ifdef CONFIG_TCP_MD5SIG
358 struct tcp_timewait_sock *twsk = tcp_twsk(sk); 406 struct tcp_timewait_sock *twsk = tcp_twsk(sk);
359 407 if (twsk->tw_md5_keylen)
360 if (twsk->tw_md5_key) {
361 tcp_free_md5sig_pool(); 408 tcp_free_md5sig_pool();
362 kfree_rcu(twsk->tw_md5_key, rcu);
363 }
364#endif 409#endif
365} 410}
366EXPORT_SYMBOL_GPL(tcp_twsk_destructor); 411EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
@@ -379,7 +424,7 @@ static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
379 */ 424 */
380struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) 425struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
381{ 426{
382 struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); 427 struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
383 428
384 if (newsk != NULL) { 429 if (newsk != NULL) {
385 const struct inet_request_sock *ireq = inet_rsk(req); 430 const struct inet_request_sock *ireq = inet_rsk(req);
@@ -424,7 +469,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
424 treq->snt_isn + 1 + tcp_s_data_size(oldtp); 469 treq->snt_isn + 1 + tcp_s_data_size(oldtp);
425 470
426 tcp_prequeue_init(newtp); 471 tcp_prequeue_init(newtp);
427 INIT_LIST_HEAD(&newtp->tsq_node);
428 472
429 tcp_init_wl(newtp, treq->rcv_isn); 473 tcp_init_wl(newtp, treq->rcv_isn);
430 474
@@ -437,7 +481,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
437 newtp->sacked_out = 0; 481 newtp->sacked_out = 0;
438 newtp->fackets_out = 0; 482 newtp->fackets_out = 0;
439 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 483 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
440 tcp_enable_early_retrans(newtp);
441 484
442 /* So many TCP implementations out there (incorrectly) count the 485 /* So many TCP implementations out there (incorrectly) count the
443 * initial SYN frame in their delayed-ACK and congestion control 486 * initial SYN frame in their delayed-ACK and congestion control
@@ -451,9 +494,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
451 newtp->frto_counter = 0; 494 newtp->frto_counter = 0;
452 newtp->frto_highmark = 0; 495 newtp->frto_highmark = 0;
453 496
454 if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && 497 newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
455 !try_module_get(newicsk->icsk_ca_ops->owner))
456 newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
457 498
458 tcp_set_ca_state(newsk, TCP_CA_Open); 499 tcp_set_ca_state(newsk, TCP_CA_Open);
459 tcp_init_xmit_timers(newsk); 500 tcp_init_xmit_timers(newsk);
@@ -509,8 +550,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
509 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 550 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
510 newtp->rx_opt.mss_clamp = req->mss; 551 newtp->rx_opt.mss_clamp = req->mss;
511 TCP_ECN_openreq_child(newtp, req); 552 TCP_ECN_openreq_child(newtp, req);
512 newtp->fastopen_rsk = NULL;
513 newtp->syn_data_acked = 0;
514 553
515 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); 554 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
516 } 555 }
@@ -519,33 +558,24 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
519EXPORT_SYMBOL(tcp_create_openreq_child); 558EXPORT_SYMBOL(tcp_create_openreq_child);
520 559
521/* 560/*
522 * Process an incoming packet for SYN_RECV sockets represented as a 561 * Process an incoming packet for SYN_RECV sockets represented
523 * request_sock. Normally sk is the listener socket but for TFO it 562 * as a request_sock.
524 * points to the child socket.
525 *
526 * XXX (TFO) - The current impl contains a special check for ack
527 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
528 *
529 * We don't need to initialize tmp_opt.sack_ok as we don't use the results
530 */ 563 */
531 564
532struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, 565struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
533 struct request_sock *req, 566 struct request_sock *req,
534 struct request_sock **prev, 567 struct request_sock **prev)
535 bool fastopen)
536{ 568{
537 struct tcp_options_received tmp_opt; 569 struct tcp_options_received tmp_opt;
538 const u8 *hash_location; 570 u8 *hash_location;
539 struct sock *child; 571 struct sock *child;
540 const struct tcphdr *th = tcp_hdr(skb); 572 const struct tcphdr *th = tcp_hdr(skb);
541 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); 573 __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
542 bool paws_reject = false; 574 int paws_reject = 0;
543
544 BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
545 575
546 tmp_opt.saw_tstamp = 0; 576 tmp_opt.saw_tstamp = 0;
547 if (th->doff > (sizeof(struct tcphdr)>>2)) { 577 if (th->doff > (sizeof(struct tcphdr)>>2)) {
548 tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); 578 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
549 579
550 if (tmp_opt.saw_tstamp) { 580 if (tmp_opt.saw_tstamp) {
551 tmp_opt.ts_recent = req->ts_recent; 581 tmp_opt.ts_recent = req->ts_recent;
@@ -553,7 +583,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
553 * it can be estimated (approximately) 583 * it can be estimated (approximately)
554 * from another data. 584 * from another data.
555 */ 585 */
556 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); 586 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
557 paws_reject = tcp_paws_reject(&tmp_opt, th->rst); 587 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
558 } 588 }
559 } 589 }
@@ -578,11 +608,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
578 * 608 *
579 * Enforce "SYN-ACK" according to figure 8, figure 6 609 * Enforce "SYN-ACK" according to figure 8, figure 6
580 * of RFC793, fixed by RFC1122. 610 * of RFC793, fixed by RFC1122.
581 *
582 * Note that even if there is new data in the SYN packet
583 * they will be thrown away too.
584 */ 611 */
585 inet_rtx_syn_ack(sk, req); 612 req->rsk_ops->rtx_syn_ack(sk, req, NULL);
586 return NULL; 613 return NULL;
587 } 614 }
588 615
@@ -638,12 +665,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
638 * sent (the segment carries an unacceptable ACK) ... 665 * sent (the segment carries an unacceptable ACK) ...
639 * a reset is sent." 666 * a reset is sent."
640 * 667 *
641 * Invalid ACK: reset will be sent by listening socket. 668 * Invalid ACK: reset will be sent by listening socket
642 * Note that the ACK validity check for a Fast Open socket is done
643 * elsewhere and is checked directly against the child socket rather
644 * than req because user data may have been sent out.
645 */ 669 */
646 if ((flg & TCP_FLAG_ACK) && !fastopen && 670 if ((flg & TCP_FLAG_ACK) &&
647 (TCP_SKB_CB(skb)->ack_seq != 671 (TCP_SKB_CB(skb)->ack_seq !=
648 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) 672 tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
649 return sk; 673 return sk;
@@ -656,7 +680,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
656 /* RFC793: "first check sequence number". */ 680 /* RFC793: "first check sequence number". */
657 681
658 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, 682 if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
659 tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { 683 tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
660 /* Out of window: send ACK and drop. */ 684 /* Out of window: send ACK and drop. */
661 if (!(flg & TCP_FLAG_RST)) 685 if (!(flg & TCP_FLAG_RST))
662 req->rsk_ops->send_ack(sk, skb, req); 686 req->rsk_ops->send_ack(sk, skb, req);
@@ -667,7 +691,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
667 691
668 /* In sequence, PAWS is OK. */ 692 /* In sequence, PAWS is OK. */
669 693
670 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) 694 if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
671 req->ts_recent = tmp_opt.rcv_tsval; 695 req->ts_recent = tmp_opt.rcv_tsval;
672 696
673 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { 697 if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
@@ -686,32 +710,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
686 710
687 /* ACK sequence verified above, just make sure ACK is 711 /* ACK sequence verified above, just make sure ACK is
688 * set. If ACK not set, just silently drop the packet. 712 * set. If ACK not set, just silently drop the packet.
689 *
690 * XXX (TFO) - if we ever allow "data after SYN", the
691 * following check needs to be removed.
692 */ 713 */
693 if (!(flg & TCP_FLAG_ACK)) 714 if (!(flg & TCP_FLAG_ACK))
694 return NULL; 715 return NULL;
695 716
696 /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
697 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
698 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
699 else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
700 tcp_rsk(req)->snt_synack = 0;
701
702 /* For Fast Open no more processing is needed (sk is the
703 * child socket).
704 */
705 if (fastopen)
706 return sk;
707
708 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ 717 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
709 if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && 718 if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
710 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { 719 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
711 inet_rsk(req)->acked = 1; 720 inet_rsk(req)->acked = 1;
712 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); 721 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
713 return NULL; 722 return NULL;
714 } 723 }
724 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
725 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
726 else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
727 tcp_rsk(req)->snt_synack = 0;
715 728
716 /* OK, ACK is valid, create big socket and 729 /* OK, ACK is valid, create big socket and
717 * feed this segment to it. It will repeat all 730 * feed this segment to it. It will repeat all
@@ -736,21 +749,11 @@ listen_overflow:
736 } 749 }
737 750
738embryonic_reset: 751embryonic_reset:
739 if (!(flg & TCP_FLAG_RST)) { 752 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
740 /* Received a bad SYN pkt - for TFO We try not to reset 753 if (!(flg & TCP_FLAG_RST))
741 * the local connection unless it's really necessary to
742 * avoid becoming vulnerable to outside attack aiming at
743 * resetting legit local connections.
744 */
745 req->rsk_ops->send_reset(sk, skb); 754 req->rsk_ops->send_reset(sk, skb);
746 } else if (fastopen) { /* received a valid RST pkt */ 755
747 reqsk_fastopen_remove(sk, req, true); 756 inet_csk_reqsk_queue_drop(sk, req, prev);
748 tcp_reset(sk);
749 }
750 if (!fastopen) {
751 inet_csk_reqsk_queue_drop(sk, req, prev);
752 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
753 }
754 return NULL; 757 return NULL;
755} 758}
756EXPORT_SYMBOL(tcp_check_req); 759EXPORT_SYMBOL(tcp_check_req);
@@ -759,12 +762,6 @@ EXPORT_SYMBOL(tcp_check_req);
759 * Queue segment on the new socket if the new socket is active, 762 * Queue segment on the new socket if the new socket is active,
760 * otherwise we just shortcircuit this and continue with 763 * otherwise we just shortcircuit this and continue with
761 * the new socket. 764 * the new socket.
762 *
763 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
764 * when entering. But other states are possible due to a race condition
765 * where after __inet_lookup_established() fails but before the listener
766 * locked is obtained, other packets cause the same connection to
767 * be created.
768 */ 765 */
769 766
770int tcp_child_process(struct sock *parent, struct sock *child, 767int tcp_child_process(struct sock *parent, struct sock *child,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5d451593ef1..faf257b9415 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -34,8 +34,6 @@
34 * 34 *
35 */ 35 */
36 36
37#define pr_fmt(fmt) "TCP: " fmt
38
39#include <net/tcp.h> 37#include <net/tcp.h>
40 38
41#include <linux/compiler.h> 39#include <linux/compiler.h>
@@ -50,9 +48,6 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
50 */ 48 */
51int sysctl_tcp_workaround_signed_windows __read_mostly = 0; 49int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
52 50
53/* Default TSQ limit of two TSO segments */
54int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
55
56/* This limits the percentage of the congestion window which we 51/* This limits the percentage of the congestion window which we
57 * will allow a single TSO frame to consume. Building TSO frames 52 * will allow a single TSO frame to consume. Building TSO frames
58 * which are too large can cause TCP streams to be bursty. 53 * which are too large can cause TCP streams to be bursty.
@@ -68,11 +63,9 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
68int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ 63int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
69EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); 64EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
70 65
71static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
72 int push_one, gfp_t gfp);
73 66
74/* Account for new data that has been sent to the network. */ 67/* Account for new data that has been sent to the network. */
75static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) 68static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
76{ 69{
77 struct tcp_sock *tp = tcp_sk(sk); 70 struct tcp_sock *tp = tcp_sk(sk);
78 unsigned int prior_packets = tp->packets_out; 71 unsigned int prior_packets = tp->packets_out;
@@ -85,8 +78,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
85 tp->frto_counter = 3; 78 tp->frto_counter = 3;
86 79
87 tp->packets_out += tcp_skb_pcount(skb); 80 tp->packets_out += tcp_skb_pcount(skb);
88 if (!prior_packets || tp->early_retrans_delayed) 81 if (!prior_packets)
89 tcp_rearm_rto(sk); 82 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
83 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
90} 84}
91 85
92/* SND.NXT, if window was not shrunk. 86/* SND.NXT, if window was not shrunk.
@@ -95,9 +89,9 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
95 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already 89 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
96 * invalid. OK, let's make this for now: 90 * invalid. OK, let's make this for now:
97 */ 91 */
98static inline __u32 tcp_acceptable_seq(const struct sock *sk) 92static inline __u32 tcp_acceptable_seq(struct sock *sk)
99{ 93{
100 const struct tcp_sock *tp = tcp_sk(sk); 94 struct tcp_sock *tp = tcp_sk(sk);
101 95
102 if (!before(tcp_wnd_end(tp), tp->snd_nxt)) 96 if (!before(tcp_wnd_end(tp), tp->snd_nxt))
103 return tp->snd_nxt; 97 return tp->snd_nxt;
@@ -122,7 +116,7 @@ static inline __u32 tcp_acceptable_seq(const struct sock *sk)
122static __u16 tcp_advertise_mss(struct sock *sk) 116static __u16 tcp_advertise_mss(struct sock *sk)
123{ 117{
124 struct tcp_sock *tp = tcp_sk(sk); 118 struct tcp_sock *tp = tcp_sk(sk);
125 const struct dst_entry *dst = __sk_dst_get(sk); 119 struct dst_entry *dst = __sk_dst_get(sk);
126 int mss = tp->advmss; 120 int mss = tp->advmss;
127 121
128 if (dst) { 122 if (dst) {
@@ -139,7 +133,7 @@ static __u16 tcp_advertise_mss(struct sock *sk)
139 133
140/* RFC2861. Reset CWND after idle period longer RTO to "restart window". 134/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
141 * This is the first part of cwnd validation mechanism. */ 135 * This is the first part of cwnd validation mechanism. */
142static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) 136static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
143{ 137{
144 struct tcp_sock *tp = tcp_sk(sk); 138 struct tcp_sock *tp = tcp_sk(sk);
145 s32 delta = tcp_time_stamp - tp->lsndtime; 139 s32 delta = tcp_time_stamp - tp->lsndtime;
@@ -160,7 +154,7 @@ static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst)
160 154
161/* Congestion state accounting after a packet has been sent. */ 155/* Congestion state accounting after a packet has been sent. */
162static void tcp_event_data_sent(struct tcp_sock *tp, 156static void tcp_event_data_sent(struct tcp_sock *tp,
163 struct sock *sk) 157 struct sk_buff *skb, struct sock *sk)
164{ 158{
165 struct inet_connection_sock *icsk = inet_csk(sk); 159 struct inet_connection_sock *icsk = inet_csk(sk);
166 const u32 now = tcp_time_stamp; 160 const u32 now = tcp_time_stamp;
@@ -301,11 +295,11 @@ static u16 tcp_select_window(struct sock *sk)
301} 295}
302 296
303/* Packet ECN state for a SYN-ACK */ 297/* Packet ECN state for a SYN-ACK */
304static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) 298static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
305{ 299{
306 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; 300 TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR;
307 if (!(tp->ecn_flags & TCP_ECN_OK)) 301 if (!(tp->ecn_flags & TCP_ECN_OK))
308 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; 302 TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE;
309} 303}
310 304
311/* Packet ECN state for a SYN. */ 305/* Packet ECN state for a SYN. */
@@ -315,13 +309,13 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
315 309
316 tp->ecn_flags = 0; 310 tp->ecn_flags = 0;
317 if (sysctl_tcp_ecn == 1) { 311 if (sysctl_tcp_ecn == 1) {
318 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; 312 TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
319 tp->ecn_flags = TCP_ECN_OK; 313 tp->ecn_flags = TCP_ECN_OK;
320 } 314 }
321} 315}
322 316
323static __inline__ void 317static __inline__ void
324TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) 318TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
325{ 319{
326 if (inet_rsk(req)->ecn_ok) 320 if (inet_rsk(req)->ecn_ok)
327 th->ece = 1; 321 th->ece = 1;
@@ -362,7 +356,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
362 skb->ip_summed = CHECKSUM_PARTIAL; 356 skb->ip_summed = CHECKSUM_PARTIAL;
363 skb->csum = 0; 357 skb->csum = 0;
364 358
365 TCP_SKB_CB(skb)->tcp_flags = flags; 359 TCP_SKB_CB(skb)->flags = flags;
366 TCP_SKB_CB(skb)->sacked = 0; 360 TCP_SKB_CB(skb)->sacked = 0;
367 361
368 skb_shinfo(skb)->gso_segs = 1; 362 skb_shinfo(skb)->gso_segs = 1;
@@ -375,7 +369,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
375 TCP_SKB_CB(skb)->end_seq = seq; 369 TCP_SKB_CB(skb)->end_seq = seq;
376} 370}
377 371
378static inline bool tcp_urg_mode(const struct tcp_sock *tp) 372static inline int tcp_urg_mode(const struct tcp_sock *tp)
379{ 373{
380 return tp->snd_una != tp->snd_up; 374 return tp->snd_una != tp->snd_up;
381} 375}
@@ -385,17 +379,15 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
385#define OPTION_MD5 (1 << 2) 379#define OPTION_MD5 (1 << 2)
386#define OPTION_WSCALE (1 << 3) 380#define OPTION_WSCALE (1 << 3)
387#define OPTION_COOKIE_EXTENSION (1 << 4) 381#define OPTION_COOKIE_EXTENSION (1 << 4)
388#define OPTION_FAST_OPEN_COOKIE (1 << 8)
389 382
390struct tcp_out_options { 383struct tcp_out_options {
391 u16 options; /* bit field of OPTION_* */ 384 u8 options; /* bit field of OPTION_* */
392 u16 mss; /* 0 to disable */
393 u8 ws; /* window scale, 0 to disable */ 385 u8 ws; /* window scale, 0 to disable */
394 u8 num_sack_blocks; /* number of SACK blocks to include */ 386 u8 num_sack_blocks; /* number of SACK blocks to include */
395 u8 hash_size; /* bytes in hash_location */ 387 u8 hash_size; /* bytes in hash_location */
396 __u8 *hash_location; /* temporary pointer, overloaded */ 388 u16 mss; /* 0 to disable */
397 __u32 tsval, tsecr; /* need to include OPTION_TS */ 389 __u32 tsval, tsecr; /* need to include OPTION_TS */
398 struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ 390 __u8 *hash_location; /* temporary pointer, overloaded */
399}; 391};
400 392
401/* The sysctl int routines are generic, so check consistency here. 393/* The sysctl int routines are generic, so check consistency here.
@@ -444,7 +436,7 @@ static u8 tcp_cookie_size_check(u8 desired)
444static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, 436static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
445 struct tcp_out_options *opts) 437 struct tcp_out_options *opts)
446{ 438{
447 u16 options = opts->options; /* mungable copy */ 439 u8 options = opts->options; /* mungable copy */
448 440
449 /* Having both authentication and cookies for security is redundant, 441 /* Having both authentication and cookies for security is redundant,
450 * and there's certainly not enough room. Instead, the cookie-less 442 * and there's certainly not enough room. Instead, the cookie-less
@@ -566,37 +558,20 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
566 558
567 tp->rx_opt.dsack = 0; 559 tp->rx_opt.dsack = 0;
568 } 560 }
569
570 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
571 struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
572
573 *ptr++ = htonl((TCPOPT_EXP << 24) |
574 ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
575 TCPOPT_FASTOPEN_MAGIC);
576
577 memcpy(ptr, foc->val, foc->len);
578 if ((foc->len & 3) == 2) {
579 u8 *align = ((u8 *)ptr) + foc->len;
580 align[0] = align[1] = TCPOPT_NOP;
581 }
582 ptr += (foc->len + 3) >> 2;
583 }
584} 561}
585 562
586/* Compute TCP options for SYN packets. This is not the final 563/* Compute TCP options for SYN packets. This is not the final
587 * network wire format yet. 564 * network wire format yet.
588 */ 565 */
589static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, 566static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
590 struct tcp_out_options *opts, 567 struct tcp_out_options *opts,
591 struct tcp_md5sig_key **md5) 568 struct tcp_md5sig_key **md5) {
592{
593 struct tcp_sock *tp = tcp_sk(sk); 569 struct tcp_sock *tp = tcp_sk(sk);
594 struct tcp_cookie_values *cvp = tp->cookie_values; 570 struct tcp_cookie_values *cvp = tp->cookie_values;
595 unsigned int remaining = MAX_TCP_OPTION_SPACE; 571 unsigned remaining = MAX_TCP_OPTION_SPACE;
596 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? 572 u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
597 tcp_cookie_size_check(cvp->cookie_desired) : 573 tcp_cookie_size_check(cvp->cookie_desired) :
598 0; 574 0;
599 struct tcp_fastopen_request *fastopen = tp->fastopen_req;
600 575
601#ifdef CONFIG_TCP_MD5SIG 576#ifdef CONFIG_TCP_MD5SIG
602 *md5 = tp->af_specific->md5_lookup(sk, sk); 577 *md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -637,16 +612,6 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
637 remaining -= TCPOLEN_SACKPERM_ALIGNED; 612 remaining -= TCPOLEN_SACKPERM_ALIGNED;
638 } 613 }
639 614
640 if (fastopen && fastopen->cookie.len >= 0) {
641 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
642 need = (need + 3) & ~3U; /* Align to 32 bits */
643 if (remaining >= need) {
644 opts->options |= OPTION_FAST_OPEN_COOKIE;
645 opts->fastopen_cookie = &fastopen->cookie;
646 remaining -= need;
647 tp->syn_fastopen = 1;
648 }
649 }
650 /* Note that timestamps are required by the specification. 615 /* Note that timestamps are required by the specification.
651 * 616 *
652 * Odd numbers of bytes are prohibited by the specification, ensuring 617 * Odd numbers of bytes are prohibited by the specification, ensuring
@@ -697,16 +662,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
697} 662}
698 663
699/* Set up TCP options for SYN-ACKs. */ 664/* Set up TCP options for SYN-ACKs. */
700static unsigned int tcp_synack_options(struct sock *sk, 665static unsigned tcp_synack_options(struct sock *sk,
701 struct request_sock *req, 666 struct request_sock *req,
702 unsigned int mss, struct sk_buff *skb, 667 unsigned mss, struct sk_buff *skb,
703 struct tcp_out_options *opts, 668 struct tcp_out_options *opts,
704 struct tcp_md5sig_key **md5, 669 struct tcp_md5sig_key **md5,
705 struct tcp_extend_values *xvp, 670 struct tcp_extend_values *xvp)
706 struct tcp_fastopen_cookie *foc)
707{ 671{
708 struct inet_request_sock *ireq = inet_rsk(req); 672 struct inet_request_sock *ireq = inet_rsk(req);
709 unsigned int remaining = MAX_TCP_OPTION_SPACE; 673 unsigned remaining = MAX_TCP_OPTION_SPACE;
710 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? 674 u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
711 xvp->cookie_plus : 675 xvp->cookie_plus :
712 0; 676 0;
@@ -748,15 +712,7 @@ static unsigned int tcp_synack_options(struct sock *sk,
748 if (unlikely(!ireq->tstamp_ok)) 712 if (unlikely(!ireq->tstamp_ok))
749 remaining -= TCPOLEN_SACKPERM_ALIGNED; 713 remaining -= TCPOLEN_SACKPERM_ALIGNED;
750 } 714 }
751 if (foc != NULL) { 715
752 u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
753 need = (need + 3) & ~3U; /* Align to 32 bits */
754 if (remaining >= need) {
755 opts->options |= OPTION_FAST_OPEN_COOKIE;
756 opts->fastopen_cookie = foc;
757 remaining -= need;
758 }
759 }
760 /* Similar rationale to tcp_syn_options() applies here, too. 716 /* Similar rationale to tcp_syn_options() applies here, too.
761 * If the <SYN> options fit, the same options should fit now! 717 * If the <SYN> options fit, the same options should fit now!
762 */ 718 */
@@ -785,13 +741,12 @@ static unsigned int tcp_synack_options(struct sock *sk,
785/* Compute TCP options for ESTABLISHED sockets. This is not the 741/* Compute TCP options for ESTABLISHED sockets. This is not the
786 * final wire format yet. 742 * final wire format yet.
787 */ 743 */
788static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, 744static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
789 struct tcp_out_options *opts, 745 struct tcp_out_options *opts,
790 struct tcp_md5sig_key **md5) 746 struct tcp_md5sig_key **md5) {
791{
792 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; 747 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
793 struct tcp_sock *tp = tcp_sk(sk); 748 struct tcp_sock *tp = tcp_sk(sk);
794 unsigned int size = 0; 749 unsigned size = 0;
795 unsigned int eff_sacks; 750 unsigned int eff_sacks;
796 751
797#ifdef CONFIG_TCP_MD5SIG 752#ifdef CONFIG_TCP_MD5SIG
@@ -813,9 +768,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
813 768
814 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; 769 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
815 if (unlikely(eff_sacks)) { 770 if (unlikely(eff_sacks)) {
816 const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; 771 const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
817 opts->num_sack_blocks = 772 opts->num_sack_blocks =
818 min_t(unsigned int, eff_sacks, 773 min_t(unsigned, eff_sacks,
819 (remaining - TCPOLEN_SACK_BASE_ALIGNED) / 774 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
820 TCPOLEN_SACK_PERBLOCK); 775 TCPOLEN_SACK_PERBLOCK);
821 size += TCPOLEN_SACK_BASE_ALIGNED + 776 size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -825,160 +780,6 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
825 return size; 780 return size;
826} 781}
827 782
828
829/* TCP SMALL QUEUES (TSQ)
830 *
831 * The TSQ goal is to keep a small number of skbs per tcp flow in the tx queues (qdisc+dev)
832 * to reduce RTT and bufferbloat.
833 * We do this using a special skb destructor (tcp_wfree).
834 *
835 * It's important that tcp_wfree() can be replaced by sock_wfree() in the event the skb
836 * needs to be reallocated in a driver.
837 * The invariant is that skb->truesize gets subtracted from sk->sk_wmem_alloc.
838 *
839 * Since transmit from skb destructor is forbidden, we use a tasklet
840 * to process all sockets that eventually need to send more skbs.
841 * We use one tasklet per cpu, with its own queue of sockets.
842 */
843struct tsq_tasklet {
844 struct tasklet_struct tasklet;
845 struct list_head head; /* queue of tcp sockets */
846};
847static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
848
849static void tcp_tsq_handler(struct sock *sk)
850{
851 if ((1 << sk->sk_state) &
852 (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
853 TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
854 tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC);
855}
856/*
857 * One tasklet per cpu tries to send more skbs.
858 * We run in tasklet context but need to disable irqs when
859 * transferring tsq->head because tcp_wfree() might
860 * interrupt us (non NAPI drivers)
861 */
862static void tcp_tasklet_func(unsigned long data)
863{
864 struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
865 LIST_HEAD(list);
866 unsigned long flags;
867 struct list_head *q, *n;
868 struct tcp_sock *tp;
869 struct sock *sk;
870
871 local_irq_save(flags);
872 list_splice_init(&tsq->head, &list);
873 local_irq_restore(flags);
874
875 list_for_each_safe(q, n, &list) {
876 tp = list_entry(q, struct tcp_sock, tsq_node);
877 list_del(&tp->tsq_node);
878
879 sk = (struct sock *)tp;
880 bh_lock_sock(sk);
881
882 if (!sock_owned_by_user(sk)) {
883 tcp_tsq_handler(sk);
884 } else {
885 /* defer the work to tcp_release_cb() */
886 set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
887 }
888 bh_unlock_sock(sk);
889
890 clear_bit(TSQ_QUEUED, &tp->tsq_flags);
891 sk_free(sk);
892 }
893}
894
895#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
896 (1UL << TCP_WRITE_TIMER_DEFERRED) | \
897 (1UL << TCP_DELACK_TIMER_DEFERRED) | \
898 (1UL << TCP_MTU_REDUCED_DEFERRED))
899/**
900 * tcp_release_cb - tcp release_sock() callback
901 * @sk: socket
902 *
903 * called from release_sock() to perform protocol dependent
904 * actions before socket release.
905 */
906void tcp_release_cb(struct sock *sk)
907{
908 struct tcp_sock *tp = tcp_sk(sk);
909 unsigned long flags, nflags;
910
911 /* perform an atomic operation only if at least one flag is set */
912 do {
913 flags = tp->tsq_flags;
914 if (!(flags & TCP_DEFERRED_ALL))
915 return;
916 nflags = flags & ~TCP_DEFERRED_ALL;
917 } while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
918
919 if (flags & (1UL << TCP_TSQ_DEFERRED))
920 tcp_tsq_handler(sk);
921
922 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
923 tcp_write_timer_handler(sk);
924 __sock_put(sk);
925 }
926 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
927 tcp_delack_timer_handler(sk);
928 __sock_put(sk);
929 }
930 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
931 sk->sk_prot->mtu_reduced(sk);
932 __sock_put(sk);
933 }
934}
935EXPORT_SYMBOL(tcp_release_cb);
936
937void __init tcp_tasklet_init(void)
938{
939 int i;
940
941 for_each_possible_cpu(i) {
942 struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
943
944 INIT_LIST_HEAD(&tsq->head);
945 tasklet_init(&tsq->tasklet,
946 tcp_tasklet_func,
947 (unsigned long)tsq);
948 }
949}
950
951/*
952 * Write buffer destructor automatically called from kfree_skb.
953 * We can't xmit new skbs from this context, as we might already
954 * hold qdisc lock.
955 */
956static void tcp_wfree(struct sk_buff *skb)
957{
958 struct sock *sk = skb->sk;
959 struct tcp_sock *tp = tcp_sk(sk);
960
961 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
962 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
963 unsigned long flags;
964 struct tsq_tasklet *tsq;
965
966 /* Keep a ref on socket.
967 * This last ref will be released in tcp_tasklet_func()
968 */
969 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
970
971 /* queue this socket to tasklet queue */
972 local_irq_save(flags);
973 tsq = &__get_cpu_var(tsq_tasklet);
974 list_add(&tp->tsq_node, &tsq->head);
975 tasklet_schedule(&tsq->tasklet);
976 local_irq_restore(flags);
977 } else {
978 sock_wfree(skb);
979 }
980}
981
982/* This routine actually transmits TCP packets queued in by 783/* This routine actually transmits TCP packets queued in by
983 * tcp_do_sendmsg(). This is used by both the initial 784 * tcp_do_sendmsg(). This is used by both the initial
984 * transmission and possible later retransmissions. 785 * transmission and possible later retransmissions.
@@ -998,7 +799,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
998 struct tcp_sock *tp; 799 struct tcp_sock *tp;
999 struct tcp_skb_cb *tcb; 800 struct tcp_skb_cb *tcb;
1000 struct tcp_out_options opts; 801 struct tcp_out_options opts;
1001 unsigned int tcp_options_size, tcp_header_size; 802 unsigned tcp_options_size, tcp_header_size;
1002 struct tcp_md5sig_key *md5; 803 struct tcp_md5sig_key *md5;
1003 struct tcphdr *th; 804 struct tcphdr *th;
1004 int err; 805 int err;
@@ -1025,7 +826,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1025 tcb = TCP_SKB_CB(skb); 826 tcb = TCP_SKB_CB(skb);
1026 memset(&opts, 0, sizeof(opts)); 827 memset(&opts, 0, sizeof(opts));
1027 828
1028 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) 829 if (unlikely(tcb->flags & TCPHDR_SYN))
1029 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); 830 tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
1030 else 831 else
1031 tcp_options_size = tcp_established_options(sk, skb, &opts, 832 tcp_options_size = tcp_established_options(sk, skb, &opts,
@@ -1040,12 +841,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1040 841
1041 skb_push(skb, tcp_header_size); 842 skb_push(skb, tcp_header_size);
1042 skb_reset_transport_header(skb); 843 skb_reset_transport_header(skb);
1043 844 skb_set_owner_w(skb, sk);
1044 skb_orphan(skb);
1045 skb->sk = sk;
1046 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
1047 tcp_wfree : sock_wfree;
1048 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1049 845
1050 /* Build TCP header and checksum it. */ 846 /* Build TCP header and checksum it. */
1051 th = tcp_hdr(skb); 847 th = tcp_hdr(skb);
@@ -1054,9 +850,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1054 th->seq = htonl(tcb->seq); 850 th->seq = htonl(tcb->seq);
1055 th->ack_seq = htonl(tp->rcv_nxt); 851 th->ack_seq = htonl(tp->rcv_nxt);
1056 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | 852 *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
1057 tcb->tcp_flags); 853 tcb->flags);
1058 854
1059 if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { 855 if (unlikely(tcb->flags & TCPHDR_SYN)) {
1060 /* RFC1323: The window in SYN & SYN/ACK segments 856 /* RFC1323: The window in SYN & SYN/ACK segments
1061 * is never scaled. 857 * is never scaled.
1062 */ 858 */
@@ -1079,7 +875,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1079 } 875 }
1080 876
1081 tcp_options_write((__be32 *)(th + 1), tp, &opts); 877 tcp_options_write((__be32 *)(th + 1), tp, &opts);
1082 if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) 878 if (likely((tcb->flags & TCPHDR_SYN) == 0))
1083 TCP_ECN_send(sk, skb, tcp_header_size); 879 TCP_ECN_send(sk, skb, tcp_header_size);
1084 880
1085#ifdef CONFIG_TCP_MD5SIG 881#ifdef CONFIG_TCP_MD5SIG
@@ -1093,11 +889,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
1093 889
1094 icsk->icsk_af_ops->send_check(sk, skb); 890 icsk->icsk_af_ops->send_check(sk, skb);
1095 891
1096 if (likely(tcb->tcp_flags & TCPHDR_ACK)) 892 if (likely(tcb->flags & TCPHDR_ACK))
1097 tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); 893 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
1098 894
1099 if (skb->len != tcp_header_size) 895 if (skb->len != tcp_header_size)
1100 tcp_event_data_sent(tp, sk); 896 tcp_event_data_sent(tp, skb, sk);
1101 897
1102 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) 898 if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
1103 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, 899 TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
@@ -1130,7 +926,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
1130} 926}
1131 927
1132/* Initialize TSO segments for a packet. */ 928/* Initialize TSO segments for a packet. */
1133static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb, 929static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
1134 unsigned int mss_now) 930 unsigned int mss_now)
1135{ 931{
1136 if (skb->len <= mss_now || !sk_can_gso(sk) || 932 if (skb->len <= mss_now || !sk_can_gso(sk) ||
@@ -1151,7 +947,7 @@ static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
1151/* When a modification to fackets out becomes necessary, we need to check 947/* When a modification to fackets out becomes necessary, we need to check
1152 * skb is counted to fackets_out or not. 948 * skb is counted to fackets_out or not.
1153 */ 949 */
1154static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb, 950static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
1155 int decr) 951 int decr)
1156{ 952{
1157 struct tcp_sock *tp = tcp_sk(sk); 953 struct tcp_sock *tp = tcp_sk(sk);
@@ -1166,7 +962,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
1166/* Pcount in the middle of the write queue got changed, we need to do various 962/* Pcount in the middle of the write queue got changed, we need to do various
1167 * tweaks to fix counters 963 * tweaks to fix counters
1168 */ 964 */
1169static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) 965static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)
1170{ 966{
1171 struct tcp_sock *tp = tcp_sk(sk); 967 struct tcp_sock *tp = tcp_sk(sk);
1172 968
@@ -1236,9 +1032,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
1236 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; 1032 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1237 1033
1238 /* PSH and FIN should only be set in the second packet. */ 1034 /* PSH and FIN should only be set in the second packet. */
1239 flags = TCP_SKB_CB(skb)->tcp_flags; 1035 flags = TCP_SKB_CB(skb)->flags;
1240 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); 1036 TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1241 TCP_SKB_CB(buff)->tcp_flags = flags; 1037 TCP_SKB_CB(buff)->flags = flags;
1242 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; 1038 TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1243 1039
1244 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { 1040 if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
@@ -1295,27 +1091,17 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
 {
 	int i, k, eat;
 
-	eat = min_t(int, len, skb_headlen(skb));
-	if (eat) {
-		__skb_pull(skb, eat);
-		skb->avail_size -= eat;
-		len -= eat;
-		if (!len)
-			return;
-	}
 	eat = len;
 	k = 0;
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
-
-		if (size <= eat) {
-			skb_frag_unref(skb, i);
-			eat -= size;
+		if (skb_shinfo(skb)->frags[i].size <= eat) {
+			put_page(skb_shinfo(skb)->frags[i].page);
+			eat -= skb_shinfo(skb)->frags[i].size;
 		} else {
 			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
 			if (eat) {
 				skb_shinfo(skb)->frags[k].page_offset += eat;
-				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+				skb_shinfo(skb)->frags[k].size -= eat;
 				eat = 0;
 			}
 			k++;
@@ -1334,7 +1120,11 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 		return -ENOMEM;
 
-	__pskb_trim_head(skb, len);
+	/* If len == headlen, we avoid __skb_pull to preserve alignment. */
+	if (unlikely(len < skb_headlen(skb)))
+		__skb_pull(skb, len);
+	else
+		__pskb_trim_head(skb, len - skb_headlen(skb));
 
 	TCP_SKB_CB(skb)->seq += len;
 	skb->ip_summed = CHECKSUM_PARTIAL;
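tcp_trim_head()/__pskb_trim_head() above remove len already-acknowledged bytes from the front of a queued segment: the linear head is pulled first (or skipped in this tree to preserve alignment), then whole page fragments are consumed and the first surviving fragment has its offset advanced. Below is a hedged sketch of that fragment loop over a plain array, with illustrative types and no page refcounting or cloned-buffer handling.

/* Hedged sketch of trimming `eat` bytes off a fragment list, in the spirit
 * of __pskb_trim_head(); `struct frag` is an illustrative stand-in. */
#include <stdio.h>

struct frag { unsigned int off, size; };

static int trim_frags(struct frag *f, int nr, unsigned int eat)
{
	int i, k = 0;

	for (i = 0; i < nr; i++) {
		if (f[i].size <= eat) {		/* fragment fully consumed */
			eat -= f[i].size;	/* (kernel would drop its page ref) */
		} else {
			f[k] = f[i];		/* keep, shifted down */
			if (eat) {		/* partially consumed: advance into it */
				f[k].off  += eat;
				f[k].size -= eat;
				eat = 0;
			}
			k++;
		}
	}
	return k;				/* new fragment count */
}

int main(void)
{
	struct frag f[3] = { {0, 1000}, {0, 1000}, {0, 1000} };
	int nr = trim_frags(f, 3, 1500);

	/* expect: 2 frags left, first one at off=500 size=500 */
	printf("frags left: %d, first off=%u size=%u\n", nr, f[0].off, f[0].size);
	return 0;
}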
@@ -1354,8 +1144,8 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 /* Calculate MSS. Not accounting for SACKs here. */
 int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int mss_now;
 
 	/* Calculate base mss without TCP options:
@@ -1363,14 +1153,6 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 	 */
 	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
 
-	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
-	if (icsk->icsk_af_ops->net_frag_header_len) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
-		if (dst && dst_allfrag(dst))
-			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
-	}
-
 	/* Clamp it (mss_clamp does not include tcp options) */
 	if (mss_now > tp->rx_opt.mss_clamp)
 		mss_now = tp->rx_opt.mss_clamp;
@@ -1391,8 +1173,8 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 /* Inverse of above */
 int tcp_mss_to_mtu(struct sock *sk, int mss)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	int mtu;
 
 	mtu = mss +
@@ -1400,13 +1182,6 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
 	      icsk->icsk_ext_hdr_len +
 	      icsk->icsk_af_ops->net_header_len;
 
-	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
-	if (icsk->icsk_af_ops->net_frag_header_len) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
-		if (dst && dst_allfrag(dst))
-			mtu += icsk->icsk_af_ops->net_frag_header_len;
-	}
 	return mtu;
 }
 
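tcp_mtu_to_mss() and tcp_mss_to_mtu() above are inverse calculations: the MSS is the path MTU minus the network header, the fixed TCP header and any recorded extension-header overhead, clamped to what the peer advertised; the MTU is recovered by adding those back. A hedged arithmetic sketch for plain IPv4 with 20-byte IP and TCP headers follows; the constants and the exact ordering are simplified relative to the kernel.

/* Hedged sketch of the MSS<->MTU arithmetic for IPv4 with no IP options
 * or extension headers; header sizes are the usual 20-byte minimums. */
#include <stdio.h>

#define IP4_HDR	20	/* net_header_len for IPv4 */
#define TCP_HDR	20	/* sizeof(struct tcphdr)   */

static int mtu_to_mss(int pmtu, int ext_hdr, int mss_clamp)
{
	int mss = pmtu - IP4_HDR - TCP_HDR;

	if (mss > mss_clamp)	/* peer clamp excludes TCP options */
		mss = mss_clamp;
	mss -= ext_hdr;		/* e.g. IPsec overhead recorded on the socket */
	if (mss < 48)		/* keep a sane floor, as the kernel does */
		mss = 48;
	return mss;
}

static int mss_to_mtu(int mss, int ext_hdr, int tcp_header_len)
{
	return mss + tcp_header_len + ext_hdr + IP4_HDR;
}

int main(void)
{
	int mss = mtu_to_mss(1500, 0, 65535);

	printf("MTU 1500 -> MSS %d\n", mss);				/* 1460 */
	printf("MSS %d -> MTU %d\n", mss, mss_to_mtu(mss, 0, TCP_HDR));	/* 1500 */
	return 0;
}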
@@ -1473,10 +1248,10 @@ EXPORT_SYMBOL(tcp_sync_mss);
1473 */ 1248 */
1474unsigned int tcp_current_mss(struct sock *sk) 1249unsigned int tcp_current_mss(struct sock *sk)
1475{ 1250{
1476 const struct tcp_sock *tp = tcp_sk(sk); 1251 struct tcp_sock *tp = tcp_sk(sk);
1477 const struct dst_entry *dst = __sk_dst_get(sk); 1252 struct dst_entry *dst = __sk_dst_get(sk);
1478 u32 mss_now; 1253 u32 mss_now;
1479 unsigned int header_len; 1254 unsigned header_len;
1480 struct tcp_out_options opts; 1255 struct tcp_out_options opts;
1481 struct tcp_md5sig_key *md5; 1256 struct tcp_md5sig_key *md5;
1482 1257
@@ -1534,22 +1309,22 @@ static void tcp_cwnd_validate(struct sock *sk)
  * modulo only when the receiver window alone is the limiting factor or
  * when we would be allowed to send the split-due-to-Nagle skb fully.
  */
-static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
-					unsigned int mss_now, unsigned int max_segs)
+static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb,
+					unsigned int mss_now, unsigned int cwnd)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 needed, window, max_len;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 needed, window, cwnd_len;
 
 	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
-	max_len = mss_now * max_segs;
+	cwnd_len = mss_now * cwnd;
 
-	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
-		return max_len;
+	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
+		return cwnd_len;
 
 	needed = min(skb->len, window);
 
-	if (max_len <= needed)
-		return max_len;
+	if (cwnd_len <= needed)
+		return cwnd_len;
 
 	return needed - needed % mss_now;
 }
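tcp_mss_split_point() above bounds how much of a large queued segment may go out at once: the congestion window converted to bytes (mss_now * cwnd in this tree, mss_now * max_segs upstream), further limited by the space left in the receive window, and rounded down to whole MSS units unless the receiver window alone is the limit. A hedged standalone sketch of that calculation, with sequence arithmetic reduced to plain lengths:

/* Hedged sketch of the split-point calculation; the "last skb in the
 * queue" detail is passed in as a flag. */
#include <stdio.h>

static unsigned int mss_split_point(unsigned int skb_len, unsigned int window,
				    unsigned int mss_now, unsigned int cwnd_segs,
				    int is_tail)
{
	unsigned int cwnd_len = mss_now * cwnd_segs;
	unsigned int needed;

	if (cwnd_len <= window && !is_tail)	/* cwnd is the binding limit */
		return cwnd_len;

	needed = skb_len < window ? skb_len : window;
	if (cwnd_len <= needed)
		return cwnd_len;

	return needed - needed % mss_now;	/* send whole MSS chunks only */
}

int main(void)
{
	/* 64 KB queued, 10000 bytes of receive window, 1460-byte MSS, cwnd of 10 */
	printf("%u\n", mss_split_point(65536, 10000, 1460, 10, 1));	/* 8760 */
	return 0;
}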
@@ -1557,14 +1332,13 @@ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_b
 /* Can at least one segment of SKB be sent right now, according to the
  * congestion window rules? If so, return how many segments are allowed.
  */
-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
-					 const struct sk_buff *skb)
+static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
+					 struct sk_buff *skb)
 {
 	u32 in_flight, cwnd;
 
 	/* Don't be strict about the congestion window for the final FIN. */
-	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
-	    tcp_skb_pcount(skb) == 1)
+	if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1)
 		return 1;
 
 	in_flight = tcp_packets_in_flight(tp);
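tcp_cwnd_test() above answers how many more segments the congestion window allows: packets in flight are those sent but not yet cumulatively acknowledged, minus those already accounted as SACKed or lost, plus outstanding retransmissions; the quota is snd_cwnd minus that, with the final FIN exempted. A hedged sketch of the quota arithmetic, with illustrative field names:

/* Hedged sketch of the congestion-window quota, mirroring the shape of
 * tcp_packets_in_flight()/tcp_cwnd_test(). */
#include <stdio.h>

struct tp_counts {
	unsigned int packets_out;	/* segments sent, not yet fully acked */
	unsigned int sacked_out;	/* selectively acknowledged           */
	unsigned int lost_out;		/* marked lost                        */
	unsigned int retrans_out;	/* retransmitted and still outstanding*/
	unsigned int snd_cwnd;		/* congestion window, in segments     */
};

static unsigned int cwnd_quota(const struct tp_counts *tp)
{
	unsigned int in_flight = tp->packets_out
			       - (tp->sacked_out + tp->lost_out)
			       + tp->retrans_out;

	return tp->snd_cwnd > in_flight ? tp->snd_cwnd - in_flight : 0;
}

int main(void)
{
	struct tp_counts tp = { .packets_out = 8, .sacked_out = 2,
				.lost_out = 1, .retrans_out = 1, .snd_cwnd = 10 };

	printf("quota: %u segments\n", cwnd_quota(&tp));	/* 10 - 6 = 4 */
	return 0;
}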
@@ -1579,7 +1353,7 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
1579 * This must be invoked the first time we consider transmitting 1353 * This must be invoked the first time we consider transmitting
1580 * SKB onto the wire. 1354 * SKB onto the wire.
1581 */ 1355 */
1582static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb, 1356static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
1583 unsigned int mss_now) 1357 unsigned int mss_now)
1584{ 1358{
1585 int tso_segs = tcp_skb_pcount(skb); 1359 int tso_segs = tcp_skb_pcount(skb);
@@ -1592,33 +1366,33 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
 }
 
 /* Minshall's variant of the Nagle send check. */
-static inline bool tcp_minshall_check(const struct tcp_sock *tp)
+static inline int tcp_minshall_check(const struct tcp_sock *tp)
 {
 	return after(tp->snd_sml, tp->snd_una) &&
 		!after(tp->snd_sml, tp->snd_nxt);
 }
 
-/* Return false, if packet can be sent now without violation Nagle's rules:
+/* Return 0, if packet can be sent now without violation Nagle's rules:
  * 1. It is full sized.
  * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
+ * 3. Or TCP_NODELAY was set.
  * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
  *    With Minshall's modification: all sent small packets are ACKed.
  */
-static inline bool tcp_nagle_check(const struct tcp_sock *tp,
+static inline int tcp_nagle_check(const struct tcp_sock *tp,
 				   const struct sk_buff *skb,
-				   unsigned int mss_now, int nonagle)
+				   unsigned mss_now, int nonagle)
 {
 	return skb->len < mss_now &&
 		((nonagle & TCP_NAGLE_CORK) ||
 		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
 
-/* Return true if the Nagle test allows this packet to be
+/* Return non-zero if the Nagle test allows this packet to be
  * sent now.
  */
-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
 				  unsigned int cur_mss, int nonagle)
 {
 	/* Nagle rule does not apply to frames, which sit in the middle of the
 	 * write_queue (they have no chances to get new data).
@@ -1627,25 +1401,24 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
 	 * argument based upon the location of SKB in the send queue.
 	 */
 	if (nonagle & TCP_NAGLE_PUSH)
-		return true;
+		return 1;
 
 	/* Don't use the nagle rule for urgent data (or for the final FIN).
 	 * Nagle can be ignored during F-RTO too (see RFC4138).
 	 */
 	if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
-	    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
-		return true;
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
+		return 1;
 
 	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
-		return true;
+		return 1;
 
-	return false;
+	return 0;
 }
 
 /* Does at least the first segment of SKB fit into the send window? */
-static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
-			     const struct sk_buff *skb,
-			     unsigned int cur_mss)
+static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb,
 				   unsigned int cur_mss)
 {
 	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
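The Nagle/Minshall code above holds back a sub-MSS segment while an earlier small segment is still unacknowledged, unless TCP_NODELAY or an explicit push applies, the frame carries FIN or urgent data, or it is full sized. The sketch below restates that decision with the same wrap-safe 32-bit sequence comparison as the kernel's after() macro; it is an illustration with invented field names, not the in-kernel predicates.

/* Hedged sketch of the Minshall-modified Nagle test. */
#include <stdio.h>
#include <stdint.h>

#define NAGLE_CORK	1
#define NAGLE_PUSH	2

static int after(uint32_t a, uint32_t b)	/* a is later than b, wrap-safe */
{
	return (int32_t)(a - b) > 0;
}

struct conn {
	uint32_t snd_una;	/* oldest unacknowledged byte        */
	uint32_t snd_nxt;	/* next byte to send                 */
	uint32_t snd_sml;	/* end of the last small packet sent */
	unsigned int packets_out;
};

/* non-zero while a previously sent sub-MSS packet is still unacknowledged */
static int minshall_check(const struct conn *c)
{
	return after(c->snd_sml, c->snd_una) && !after(c->snd_sml, c->snd_nxt);
}

/* non-zero: hold the segment back */
static int nagle_defers(const struct conn *c, unsigned int len,
			unsigned int mss, int nonagle, int has_fin)
{
	if (nonagle & NAGLE_PUSH)	/* TCP_NODELAY or an explicit push */
		return 0;
	if (has_fin || len >= mss)	/* final FIN and full segments go out */
		return 0;
	return (nonagle & NAGLE_CORK) ||
	       (c->packets_out && minshall_check(c));
}

int main(void)
{
	struct conn c = { .snd_una = 1000, .snd_nxt = 1400,
			  .snd_sml = 1400, .packets_out = 1 };

	printf("defer 100-byte write: %d\n", nagle_defers(&c, 100, 1460, 0, 0)); /* 1 */
	c.snd_una = 1400;		/* the small packet got acked */
	printf("defer after ack:      %d\n", nagle_defers(&c, 100, 1460, 0, 0)); /* 0 */
	return 0;
}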
1651 1424
@@ -1659,10 +1432,10 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
1659 * should be put on the wire right now. If so, it returns the number of 1432 * should be put on the wire right now. If so, it returns the number of
1660 * packets allowed by the congestion window. 1433 * packets allowed by the congestion window.
1661 */ 1434 */
1662static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, 1435static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
1663 unsigned int cur_mss, int nonagle) 1436 unsigned int cur_mss, int nonagle)
1664{ 1437{
1665 const struct tcp_sock *tp = tcp_sk(sk); 1438 struct tcp_sock *tp = tcp_sk(sk);
1666 unsigned int cwnd_quota; 1439 unsigned int cwnd_quota;
1667 1440
1668 tcp_init_tso_segs(sk, skb, cur_mss); 1441 tcp_init_tso_segs(sk, skb, cur_mss);
@@ -1678,9 +1451,9 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
1678} 1451}
1679 1452
1680/* Test if sending is allowed right now. */ 1453/* Test if sending is allowed right now. */
1681bool tcp_may_send_now(struct sock *sk) 1454int tcp_may_send_now(struct sock *sk)
1682{ 1455{
1683 const struct tcp_sock *tp = tcp_sk(sk); 1456 struct tcp_sock *tp = tcp_sk(sk);
1684 struct sk_buff *skb = tcp_send_head(sk); 1457 struct sk_buff *skb = tcp_send_head(sk);
1685 1458
1686 return skb && 1459 return skb &&
@@ -1722,9 +1495,9 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1722 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; 1495 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1723 1496
1724 /* PSH and FIN should only be set in the second packet. */ 1497 /* PSH and FIN should only be set in the second packet. */
1725 flags = TCP_SKB_CB(skb)->tcp_flags; 1498 flags = TCP_SKB_CB(skb)->flags;
1726 TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); 1499 TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
1727 TCP_SKB_CB(buff)->tcp_flags = flags; 1500 TCP_SKB_CB(buff)->flags = flags;
1728 1501
1729 /* This packet was never sent out yet, so no SACK bits. */ 1502 /* This packet was never sent out yet, so no SACK bits. */
1730 TCP_SKB_CB(buff)->sacked = 0; 1503 TCP_SKB_CB(buff)->sacked = 0;
@@ -1748,14 +1521,14 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  *
  * This algorithm is from John Heffner.
  */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
 	int win_divisor;
 
-	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
 		goto send_now;
 
 	if (icsk->icsk_ca_state != TCP_CA_Open)
@@ -1778,8 +1551,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	limit = min(send_win, cong_win);
 
 	/* If a full-sized TSO skb can be sent, do it. */
-	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-			   sk->sk_gso_max_segs * tp->mss_cache))
+	if (limit >= sk->sk_gso_max_size)
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
@@ -1802,18 +1574,18 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 		 * frame, so if we have space for more than 3 frames
 		 * then send now.
 		 */
-		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
+		if (limit > tcp_max_burst(tp) * tp->mss_cache)
 			goto send_now;
 	}
 
 	/* Ok, it looks like it is advisable to defer. */
 	tp->tso_deferred = 1 | (jiffies << 1);
 
-	return true;
+	return 1;
 
 send_now:
 	tp->tso_deferred = 0;
-	return false;
+	return 0;
 }
 
 /* Create a new MTU probe if we are ready.
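tcp_tso_should_defer() above is John Heffner's heuristic: if neither the remaining send window nor the remaining congestion window allows a full-sized TSO burst, briefly defer in the hope of sending one larger frame later, unless we already hold a win_divisor share of the full window, or (with win_divisor disabled) roughly three spare frames. A hedged sketch of just the sizing decision; the parameters are pre-computed byte counts and the thresholds follow the shape of the heuristic, not its exact kernel form.

/* Hedged sketch of the TSO-defer sizing decision.
 * limit    = bytes sendable right now (min of remaining send/cong window)
 * full_wnd = min(full advertised window, full cwnd in bytes)             */
#include <stdio.h>

static int tso_should_defer(unsigned int limit, unsigned int full_wnd,
			    unsigned int skb_len, unsigned int mss,
			    unsigned int gso_max_size, int win_divisor)
{
	if (limit >= gso_max_size)	/* a full-sized TSO frame fits: send */
		return 0;
	if (skb_len <= limit)		/* everything queued fits already    */
		return 0;
	if (win_divisor) {
		/* send once we own at least 1/win_divisor of the full window */
		if (limit >= full_wnd / win_divisor)
			return 0;
	} else if (limit > 3 * mss) {	/* ~3 spare frames: just send        */
		return 0;
	}
	return 1;			/* defer: wait for data or ACKs      */
}

int main(void)
{
	/* 8 KB sendable now, 48 KB full window, 64 KB queued, divisor 3 */
	printf("defer: %d\n",
	       tso_should_defer(8192, 48000, 65536, 1460, 65536, 3));	/* 1 */
	return 0;
}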
@@ -1883,7 +1655,7 @@ static int tcp_mtu_probe(struct sock *sk)
1883 1655
1884 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; 1656 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
1885 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; 1657 TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
1886 TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; 1658 TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
1887 TCP_SKB_CB(nskb)->sacked = 0; 1659 TCP_SKB_CB(nskb)->sacked = 0;
1888 nskb->csum = 0; 1660 nskb->csum = 0;
1889 nskb->ip_summed = skb->ip_summed; 1661 nskb->ip_summed = skb->ip_summed;
@@ -1903,11 +1675,11 @@ static int tcp_mtu_probe(struct sock *sk)
1903 if (skb->len <= copy) { 1675 if (skb->len <= copy) {
1904 /* We've eaten all the data from this skb. 1676 /* We've eaten all the data from this skb.
1905 * Throw it away. */ 1677 * Throw it away. */
1906 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1678 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
1907 tcp_unlink_write_queue(skb, sk); 1679 tcp_unlink_write_queue(skb, sk);
1908 sk_wmem_free_skb(sk, skb); 1680 sk_wmem_free_skb(sk, skb);
1909 } else { 1681 } else {
1910 TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & 1682 TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
1911 ~(TCPHDR_FIN|TCPHDR_PSH); 1683 ~(TCPHDR_FIN|TCPHDR_PSH);
1912 if (!skb_shinfo(skb)->nr_frags) { 1684 if (!skb_shinfo(skb)->nr_frags) {
1913 skb_pull(skb, copy); 1685 skb_pull(skb, copy);
@@ -1955,11 +1727,11 @@ static int tcp_mtu_probe(struct sock *sk)
1955 * snd_up-64k-mss .. snd_up cannot be large. However, taking into 1727 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
1956 * account rare use of URG, this is not a big flaw. 1728 * account rare use of URG, this is not a big flaw.
1957 * 1729 *
1958 * Returns true, if no segments are in flight and we have queued segments, 1730 * Returns 1, if no segments are in flight and we have queued segments, but
1959 * but cannot send anything now because of SWS or another problem. 1731 * cannot send anything now because of SWS or another problem.
1960 */ 1732 */
1961static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, 1733static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1962 int push_one, gfp_t gfp) 1734 int push_one, gfp_t gfp)
1963{ 1735{
1964 struct tcp_sock *tp = tcp_sk(sk); 1736 struct tcp_sock *tp = tcp_sk(sk);
1965 struct sk_buff *skb; 1737 struct sk_buff *skb;
@@ -1973,7 +1745,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1973 /* Do MTU probing. */ 1745 /* Do MTU probing. */
1974 result = tcp_mtu_probe(sk); 1746 result = tcp_mtu_probe(sk);
1975 if (!result) { 1747 if (!result) {
1976 return false; 1748 return 0;
1977 } else if (result > 0) { 1749 } else if (result > 0) {
1978 sent_pkts = 1; 1750 sent_pkts = 1;
1979 } 1751 }
@@ -1982,13 +1754,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1982 while ((skb = tcp_send_head(sk))) { 1754 while ((skb = tcp_send_head(sk))) {
1983 unsigned int limit; 1755 unsigned int limit;
1984 1756
1985
1986 tso_segs = tcp_init_tso_segs(sk, skb, mss_now); 1757 tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
1987 BUG_ON(!tso_segs); 1758 BUG_ON(!tso_segs);
1988 1759
1989 if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE)
1990 goto repair; /* Skip network transmission */
1991
1992 cwnd_quota = tcp_cwnd_test(tp, skb); 1760 cwnd_quota = tcp_cwnd_test(tp, skb);
1993 if (!cwnd_quota) 1761 if (!cwnd_quota)
1994 break; 1762 break;
@@ -2006,19 +1774,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2006 break; 1774 break;
2007 } 1775 }
2008 1776
2009 /* TSQ : sk_wmem_alloc accounts skb truesize,
2010 * including skb overhead. But thats OK.
2011 */
2012 if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
2013 set_bit(TSQ_THROTTLED, &tp->tsq_flags);
2014 break;
2015 }
2016 limit = mss_now; 1777 limit = mss_now;
2017 if (tso_segs > 1 && !tcp_urg_mode(tp)) 1778 if (tso_segs > 1 && !tcp_urg_mode(tp))
2018 limit = tcp_mss_split_point(sk, skb, mss_now, 1779 limit = tcp_mss_split_point(sk, skb, mss_now,
2019 min_t(unsigned int, 1780 cwnd_quota);
2020 cwnd_quota,
2021 sk->sk_gso_max_segs));
2022 1781
2023 if (skb->len > limit && 1782 if (skb->len > limit &&
2024 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 1783 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -2029,24 +1788,21 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2029 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) 1788 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
2030 break; 1789 break;
2031 1790
2032repair:
2033 /* Advance the send_head. This one is sent out. 1791 /* Advance the send_head. This one is sent out.
2034 * This call will increment packets_out. 1792 * This call will increment packets_out.
2035 */ 1793 */
2036 tcp_event_new_data_sent(sk, skb); 1794 tcp_event_new_data_sent(sk, skb);
2037 1795
2038 tcp_minshall_update(tp, mss_now, skb); 1796 tcp_minshall_update(tp, mss_now, skb);
2039 sent_pkts += tcp_skb_pcount(skb); 1797 sent_pkts++;
2040 1798
2041 if (push_one) 1799 if (push_one)
2042 break; 1800 break;
2043 } 1801 }
2044 1802
2045 if (likely(sent_pkts)) { 1803 if (likely(sent_pkts)) {
2046 if (tcp_in_cwnd_reduction(sk))
2047 tp->prr_out += sent_pkts;
2048 tcp_cwnd_validate(sk); 1804 tcp_cwnd_validate(sk);
2049 return false; 1805 return 0;
2050 } 1806 }
2051 return !tp->packets_out && tcp_send_head(sk); 1807 return !tp->packets_out && tcp_send_head(sk);
2052} 1808}
@@ -2065,8 +1821,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2065 if (unlikely(sk->sk_state == TCP_CLOSE)) 1821 if (unlikely(sk->sk_state == TCP_CLOSE))
2066 return; 1822 return;
2067 1823
2068 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, 1824 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
2069 sk_gfp_atomic(sk, GFP_ATOMIC)))
2070 tcp_check_probe_timer(sk); 1825 tcp_check_probe_timer(sk);
2071} 1826}
2072 1827
@@ -2155,7 +1910,7 @@ u32 __tcp_select_window(struct sock *sk)
2155 if (free_space < (full_space >> 1)) { 1910 if (free_space < (full_space >> 1)) {
2156 icsk->icsk_ack.quick = 0; 1911 icsk->icsk_ack.quick = 0;
2157 1912
2158 if (sk_under_memory_pressure(sk)) 1913 if (tcp_memory_pressure)
2159 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 1914 tp->rcv_ssthresh = min(tp->rcv_ssthresh,
2160 4U * tp->advmss); 1915 4U * tp->advmss);
2161 1916
@@ -2228,7 +1983,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
2228 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; 1983 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
2229 1984
2230 /* Merge over control information. This moves PSH/FIN etc. over */ 1985 /* Merge over control information. This moves PSH/FIN etc. over */
2231 TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; 1986 TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags;
2232 1987
2233 /* All done, get rid of second SKB and account for it so 1988 /* All done, get rid of second SKB and account for it so
2234 * packet counting does not break. 1989 * packet counting does not break.
@@ -2246,22 +2001,22 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 }
 
 /* Check if coalescing SKBs is legal. */
-static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
+static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
 {
 	if (tcp_skb_pcount(skb) > 1)
-		return false;
+		return 0;
 	/* TODO: SACK collapsing could be used to remove this condition */
 	if (skb_shinfo(skb)->nr_frags != 0)
-		return false;
+		return 0;
 	if (skb_cloned(skb))
-		return false;
+		return 0;
 	if (skb == tcp_send_head(sk))
-		return false;
+		return 0;
 	/* Some heurestics for collapsing over SACK'd could be invented */
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
-		return false;
+		return 0;
 
-	return true;
+	return 1;
 }
 
 /* Collapse packets in the retransmit queue to make to create
@@ -2272,11 +2027,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2272{ 2027{
2273 struct tcp_sock *tp = tcp_sk(sk); 2028 struct tcp_sock *tp = tcp_sk(sk);
2274 struct sk_buff *skb = to, *tmp; 2029 struct sk_buff *skb = to, *tmp;
2275 bool first = true; 2030 int first = 1;
2276 2031
2277 if (!sysctl_tcp_retrans_collapse) 2032 if (!sysctl_tcp_retrans_collapse)
2278 return; 2033 return;
2279 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) 2034 if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
2280 return; 2035 return;
2281 2036
2282 tcp_for_write_queue_from_safe(skb, tmp, sk) { 2037 tcp_for_write_queue_from_safe(skb, tmp, sk) {
@@ -2286,7 +2041,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2286 space -= skb->len; 2041 space -= skb->len;
2287 2042
2288 if (first) { 2043 if (first) {
2289 first = false; 2044 first = 0;
2290 continue; 2045 continue;
2291 } 2046 }
2292 2047
@@ -2295,7 +2050,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2295 /* Punt if not enough space exists in the first SKB for 2050 /* Punt if not enough space exists in the first SKB for
2296 * the data in the second 2051 * the data in the second
2297 */ 2052 */
2298 if (skb->len > skb_availroom(to)) 2053 if (skb->len > skb_tailroom(to))
2299 break; 2054 break;
2300 2055
2301 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) 2056 if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
@@ -2309,11 +2064,12 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
2309 * state updates are done by the caller. Returns non-zero if an 2064 * state updates are done by the caller. Returns non-zero if an
2310 * error occurred which prevented the send. 2065 * error occurred which prevented the send.
2311 */ 2066 */
2312int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) 2067int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2313{ 2068{
2314 struct tcp_sock *tp = tcp_sk(sk); 2069 struct tcp_sock *tp = tcp_sk(sk);
2315 struct inet_connection_sock *icsk = inet_csk(sk); 2070 struct inet_connection_sock *icsk = inet_csk(sk);
2316 unsigned int cur_mss; 2071 unsigned int cur_mss;
2072 int err;
2317 2073
2318 /* Inconslusive MTU probe */ 2074 /* Inconslusive MTU probe */
2319 if (icsk->icsk_mtup.probe_size) { 2075 if (icsk->icsk_mtup.probe_size) {
@@ -2367,12 +2123,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2367 * since it is cheap to do so and saves bytes on the network. 2123 * since it is cheap to do so and saves bytes on the network.
2368 */ 2124 */
2369 if (skb->len > 0 && 2125 if (skb->len > 0 &&
2370 (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) && 2126 (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
2371 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { 2127 tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
2372 if (!pskb_trim(skb, 0)) { 2128 if (!pskb_trim(skb, 0)) {
2373 /* Reuse, even though it does some unnecessary work */ 2129 /* Reuse, even though it does some unnecessary work */
2374 tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, 2130 tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
2375 TCP_SKB_CB(skb)->tcp_flags); 2131 TCP_SKB_CB(skb)->flags);
2376 skb->ip_summed = CHECKSUM_NONE; 2132 skb->ip_summed = CHECKSUM_NONE;
2377 } 2133 }
2378 } 2134 }
@@ -2382,21 +2138,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-	/* make sure skb->data is aligned on arches that require it */
-	if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) {
-		struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
-						   GFP_ATOMIC);
-		return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-			      -ENOBUFS;
-	} else {
-		return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
-	}
-}
-
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	int err = __tcp_retransmit_skb(sk, skb);
+	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 
 	if (err == 0) {
 		/* Update global TCP statistics. */
@@ -2406,7 +2148,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 
 #if FASTRETRANS_DEBUG > 0
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
-		net_dbg_ratelimited("retrans_out leaked\n");
+		if (net_ratelimit())
+			printk(KERN_DEBUG "retrans_out leaked.\n");
 	}
 #endif
 	if (!tp->retrans_out)
@@ -2431,18 +2174,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2431/* Check if we forward retransmits are possible in the current 2174/* Check if we forward retransmits are possible in the current
2432 * window/congestion state. 2175 * window/congestion state.
2433 */ 2176 */
2434static bool tcp_can_forward_retransmit(struct sock *sk) 2177static int tcp_can_forward_retransmit(struct sock *sk)
2435{ 2178{
2436 const struct inet_connection_sock *icsk = inet_csk(sk); 2179 const struct inet_connection_sock *icsk = inet_csk(sk);
2437 const struct tcp_sock *tp = tcp_sk(sk); 2180 struct tcp_sock *tp = tcp_sk(sk);
2438 2181
2439 /* Forward retransmissions are possible only during Recovery. */ 2182 /* Forward retransmissions are possible only during Recovery. */
2440 if (icsk->icsk_ca_state != TCP_CA_Recovery) 2183 if (icsk->icsk_ca_state != TCP_CA_Recovery)
2441 return false; 2184 return 0;
2442 2185
2443 /* No forward retransmissions in Reno are possible. */ 2186 /* No forward retransmissions in Reno are possible. */
2444 if (tcp_is_reno(tp)) 2187 if (tcp_is_reno(tp))
2445 return false; 2188 return 0;
2446 2189
2447 /* Yeah, we have to make difficult choice between forward transmission 2190 /* Yeah, we have to make difficult choice between forward transmission
2448 * and retransmission... Both ways have their merits... 2191 * and retransmission... Both ways have their merits...
@@ -2453,9 +2196,9 @@ static bool tcp_can_forward_retransmit(struct sock *sk)
2453 */ 2196 */
2454 2197
2455 if (tcp_may_send_now(sk)) 2198 if (tcp_may_send_now(sk))
2456 return false; 2199 return 0;
2457 2200
2458 return true; 2201 return 1;
2459} 2202}
2460 2203
2461/* This gets called after a retransmit timeout, and the initially 2204/* This gets called after a retransmit timeout, and the initially
@@ -2545,15 +2288,10 @@ begin_fwd:
2545 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) 2288 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
2546 continue; 2289 continue;
2547 2290
2548 if (tcp_retransmit_skb(sk, skb)) { 2291 if (tcp_retransmit_skb(sk, skb))
2549 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
2550 return; 2292 return;
2551 }
2552 NET_INC_STATS_BH(sock_net(sk), mib_idx); 2293 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2553 2294
2554 if (tcp_in_cwnd_reduction(sk))
2555 tp->prr_out += tcp_skb_pcount(skb);
2556
2557 if (skb == tcp_write_queue_head(sk)) 2295 if (skb == tcp_write_queue_head(sk))
2558 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2296 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2559 inet_csk(sk)->icsk_rto, 2297 inet_csk(sk)->icsk_rto,
@@ -2577,7 +2315,7 @@ void tcp_send_fin(struct sock *sk)
2577 mss_now = tcp_current_mss(sk); 2315 mss_now = tcp_current_mss(sk);
2578 2316
2579 if (tcp_send_head(sk) != NULL) { 2317 if (tcp_send_head(sk) != NULL) {
2580 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN; 2318 TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
2581 TCP_SKB_CB(skb)->end_seq++; 2319 TCP_SKB_CB(skb)->end_seq++;
2582 tp->write_seq++; 2320 tp->write_seq++;
2583 } else { 2321 } else {
@@ -2639,11 +2377,11 @@ int tcp_send_synack(struct sock *sk)
2639 struct sk_buff *skb; 2377 struct sk_buff *skb;
2640 2378
2641 skb = tcp_write_queue_head(sk); 2379 skb = tcp_write_queue_head(sk);
2642 if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 2380 if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
2643 pr_debug("%s: wrong queue state\n", __func__); 2381 printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
2644 return -EFAULT; 2382 return -EFAULT;
2645 } 2383 }
2646 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { 2384 if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
2647 if (skb_cloned(skb)) { 2385 if (skb_cloned(skb)) {
2648 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); 2386 struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
2649 if (nskb == NULL) 2387 if (nskb == NULL)
@@ -2657,27 +2395,17 @@ int tcp_send_synack(struct sock *sk)
2657 skb = nskb; 2395 skb = nskb;
2658 } 2396 }
2659 2397
2660 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; 2398 TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
2661 TCP_ECN_send_synack(tcp_sk(sk), skb); 2399 TCP_ECN_send_synack(tcp_sk(sk), skb);
2662 } 2400 }
2663 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2401 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2664 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2402 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2665} 2403}
2666 2404
2667/** 2405/* Prepare a SYN-ACK. */
2668 * tcp_make_synack - Prepare a SYN-ACK.
2669 * sk: listener socket
2670 * dst: dst entry attached to the SYNACK
2671 * req: request_sock pointer
2672 * rvp: request_values pointer
2673 *
2674 * Allocate one skb and build a SYNACK packet.
2675 * @dst is consumed : Caller should not use it again.
2676 */
2677struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, 2406struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2678 struct request_sock *req, 2407 struct request_sock *req,
2679 struct request_values *rvp, 2408 struct request_values *rvp)
2680 struct tcp_fastopen_cookie *foc)
2681{ 2409{
2682 struct tcp_out_options opts; 2410 struct tcp_out_options opts;
2683 struct tcp_extend_values *xvp = tcp_xv(rvp); 2411 struct tcp_extend_values *xvp = tcp_xv(rvp);
@@ -2693,16 +2421,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2693 2421
2694 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) 2422 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2695 s_data_desired = cvp->s_data_desired; 2423 s_data_desired = cvp->s_data_desired;
2696 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, 2424 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
2697 sk_gfp_atomic(sk, GFP_ATOMIC)); 2425 if (skb == NULL)
2698 if (unlikely(!skb)) {
2699 dst_release(dst);
2700 return NULL; 2426 return NULL;
2701 } 2427
2702 /* Reserve space for headers. */ 2428 /* Reserve space for headers. */
2703 skb_reserve(skb, MAX_TCP_HEADER); 2429 skb_reserve(skb, MAX_TCP_HEADER);
2704 2430
2705 skb_dst_set(skb, dst); 2431 skb_dst_set(skb, dst_clone(dst));
2706 2432
2707 mss = dst_metric_advmss(dst); 2433 mss = dst_metric_advmss(dst);
2708 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) 2434 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2737,7 +2463,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2737#endif 2463#endif
2738 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2464 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2739 tcp_header_size = tcp_synack_options(sk, req, mss, 2465 tcp_header_size = tcp_synack_options(sk, req, mss,
2740 skb, &opts, &md5, xvp, foc) 2466 skb, &opts, &md5, xvp)
2741 + sizeof(*th); 2467 + sizeof(*th);
2742 2468
2743 skb_push(skb, tcp_header_size); 2469 skb_push(skb, tcp_header_size);
@@ -2791,8 +2517,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2791 } 2517 }
2792 2518
2793 th->seq = htonl(TCP_SKB_CB(skb)->seq); 2519 th->seq = htonl(TCP_SKB_CB(skb)->seq);
2794 /* XXX data is queued and acked as is. No buffer/window check */ 2520 th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
2795 th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
2796 2521
2797 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ 2522 /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
2798 th->window = htons(min(req->rcv_wnd, 65535U)); 2523 th->window = htons(min(req->rcv_wnd, 65535U));
@@ -2813,9 +2538,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2813EXPORT_SYMBOL(tcp_make_synack); 2538EXPORT_SYMBOL(tcp_make_synack);
2814 2539
2815/* Do all connect socket setups that can be done AF independent. */ 2540/* Do all connect socket setups that can be done AF independent. */
2816void tcp_connect_init(struct sock *sk) 2541static void tcp_connect_init(struct sock *sk)
2817{ 2542{
2818 const struct dst_entry *dst = __sk_dst_get(sk); 2543 struct dst_entry *dst = __sk_dst_get(sk);
2819 struct tcp_sock *tp = tcp_sk(sk); 2544 struct tcp_sock *tp = tcp_sk(sk);
2820 __u8 rcv_wscale; 2545 __u8 rcv_wscale;
2821 2546
@@ -2868,121 +2593,15 @@ void tcp_connect_init(struct sock *sk)
2868 tp->snd_una = tp->write_seq; 2593 tp->snd_una = tp->write_seq;
2869 tp->snd_sml = tp->write_seq; 2594 tp->snd_sml = tp->write_seq;
2870 tp->snd_up = tp->write_seq; 2595 tp->snd_up = tp->write_seq;
2871 tp->snd_nxt = tp->write_seq; 2596 tp->rcv_nxt = 0;
2872 2597 tp->rcv_wup = 0;
2873 if (likely(!tp->repair)) 2598 tp->copied_seq = 0;
2874 tp->rcv_nxt = 0;
2875 tp->rcv_wup = tp->rcv_nxt;
2876 tp->copied_seq = tp->rcv_nxt;
2877 2599
2878 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; 2600 inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
2879 inet_csk(sk)->icsk_retransmits = 0; 2601 inet_csk(sk)->icsk_retransmits = 0;
2880 tcp_clear_retrans(tp); 2602 tcp_clear_retrans(tp);
2881} 2603}
2882 2604
2883static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
2884{
2885 struct tcp_sock *tp = tcp_sk(sk);
2886 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
2887
2888 tcb->end_seq += skb->len;
2889 skb_header_release(skb);
2890 __tcp_add_write_queue_tail(sk, skb);
2891 sk->sk_wmem_queued += skb->truesize;
2892 sk_mem_charge(sk, skb->truesize);
2893 tp->write_seq = tcb->end_seq;
2894 tp->packets_out += tcp_skb_pcount(skb);
2895}
2896
2897/* Build and send a SYN with data and (cached) Fast Open cookie. However,
2898 * queue a data-only packet after the regular SYN, such that regular SYNs
2899 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
2900 * only the SYN sequence, the data are retransmitted in the first ACK.
2901 * If cookie is not cached or other error occurs, falls back to send a
2902 * regular SYN with Fast Open cookie request option.
2903 */
2904static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
2905{
2906 struct tcp_sock *tp = tcp_sk(sk);
2907 struct tcp_fastopen_request *fo = tp->fastopen_req;
2908 int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
2909 struct sk_buff *syn_data = NULL, *data;
2910 unsigned long last_syn_loss = 0;
2911
2912 tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
2913 tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
2914 &syn_loss, &last_syn_loss);
2915 /* Recurring FO SYN losses: revert to regular handshake temporarily */
2916 if (syn_loss > 1 &&
2917 time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
2918 fo->cookie.len = -1;
2919 goto fallback;
2920 }
2921
2922 if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
2923 fo->cookie.len = -1;
2924 else if (fo->cookie.len <= 0)
2925 goto fallback;
2926
2927 /* MSS for SYN-data is based on cached MSS and bounded by PMTU and
2928 * user-MSS. Reserve maximum option space for middleboxes that add
2929 * private TCP options. The cost is reduced data space in SYN :(
2930 */
2931 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->rx_opt.mss_clamp)
2932 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
2933 space = tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
2934 MAX_TCP_OPTION_SPACE;
2935
2936 syn_data = skb_copy_expand(syn, skb_headroom(syn), space,
2937 sk->sk_allocation);
2938 if (syn_data == NULL)
2939 goto fallback;
2940
2941 for (i = 0; i < iovlen && syn_data->len < space; ++i) {
2942 struct iovec *iov = &fo->data->msg_iov[i];
2943 unsigned char __user *from = iov->iov_base;
2944 int len = iov->iov_len;
2945
2946 if (syn_data->len + len > space)
2947 len = space - syn_data->len;
2948 else if (i + 1 == iovlen)
2949 /* No more data pending in inet_wait_for_connect() */
2950 fo->data = NULL;
2951
2952 if (skb_add_data(syn_data, from, len))
2953 goto fallback;
2954 }
2955
2956 /* Queue a data-only packet after the regular SYN for retransmission */
2957 data = pskb_copy(syn_data, sk->sk_allocation);
2958 if (data == NULL)
2959 goto fallback;
2960 TCP_SKB_CB(data)->seq++;
2961 TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
2962 TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
2963 tcp_connect_queue_skb(sk, data);
2964 fo->copied = data->len;
2965
2966 if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
2967 tp->syn_data = (fo->copied > 0);
2968 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
2969 goto done;
2970 }
2971 syn_data = NULL;
2972
2973fallback:
2974 /* Send a regular SYN with Fast Open cookie request option */
2975 if (fo->cookie.len > 0)
2976 fo->cookie.len = 0;
2977 err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
2978 if (err)
2979 tp->syn_fastopen = 0;
2980 kfree_skb(syn_data);
2981done:
2982 fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */
2983 return err;
2984}
2985
2986/* Build a SYN and send it off. */ 2605/* Build a SYN and send it off. */
2987int tcp_connect(struct sock *sk) 2606int tcp_connect(struct sock *sk)
2988{ 2607{
@@ -2992,11 +2611,6 @@ int tcp_connect(struct sock *sk)
2992 2611
2993 tcp_connect_init(sk); 2612 tcp_connect_init(sk);
2994 2613
2995 if (unlikely(tp->repair)) {
2996 tcp_finish_connect(sk, NULL);
2997 return 0;
2998 }
2999
3000 buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); 2614 buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
3001 if (unlikely(buff == NULL)) 2615 if (unlikely(buff == NULL))
3002 return -ENOBUFS; 2616 return -ENOBUFS;
@@ -3004,14 +2618,19 @@ int tcp_connect(struct sock *sk)
3004 /* Reserve space for headers. */ 2618 /* Reserve space for headers. */
3005 skb_reserve(buff, MAX_TCP_HEADER); 2619 skb_reserve(buff, MAX_TCP_HEADER);
3006 2620
2621 tp->snd_nxt = tp->write_seq;
3007 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); 2622 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
3008 tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;
3009 tcp_connect_queue_skb(sk, buff);
3010 TCP_ECN_send_syn(sk, buff); 2623 TCP_ECN_send_syn(sk, buff);
3011 2624
3012 /* Send off SYN; include data in Fast Open. */ 2625 /* Send it off. */
3013 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : 2626 TCP_SKB_CB(buff)->when = tcp_time_stamp;
3014 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); 2627 tp->retrans_stamp = TCP_SKB_CB(buff)->when;
2628 skb_header_release(buff);
2629 __tcp_add_write_queue_tail(sk, buff);
2630 sk->sk_wmem_queued += buff->truesize;
2631 sk_mem_charge(sk, buff->truesize);
2632 tp->packets_out += tcp_skb_pcount(buff);
2633 err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
3015 if (err == -ECONNREFUSED) 2634 if (err == -ECONNREFUSED)
3016 return err; 2635 return err;
3017 2636
@@ -3098,7 +2717,7 @@ void tcp_send_ack(struct sock *sk)
3098 * tcp_transmit_skb() will set the ownership to this 2717 * tcp_transmit_skb() will set the ownership to this
3099 * sock. 2718 * sock.
3100 */ 2719 */
3101 buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); 2720 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3102 if (buff == NULL) { 2721 if (buff == NULL) {
3103 inet_csk_schedule_ack(sk); 2722 inet_csk_schedule_ack(sk);
3104 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; 2723 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -3113,7 +2732,7 @@ void tcp_send_ack(struct sock *sk)
3113 2732
3114 /* Send it off, this clears delayed acks for us. */ 2733 /* Send it off, this clears delayed acks for us. */
3115 TCP_SKB_CB(buff)->when = tcp_time_stamp; 2734 TCP_SKB_CB(buff)->when = tcp_time_stamp;
3116 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); 2735 tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
3117} 2736}
3118 2737
3119/* This routine sends a packet with an out of date sequence 2738/* This routine sends a packet with an out of date sequence
@@ -3133,7 +2752,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3133 struct sk_buff *skb; 2752 struct sk_buff *skb;
3134 2753
3135 /* We don't queue it, tcp_transmit_skb() sets ownership. */ 2754 /* We don't queue it, tcp_transmit_skb() sets ownership. */
3136 skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); 2755 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3137 if (skb == NULL) 2756 if (skb == NULL)
3138 return -1; 2757 return -1;
3139 2758
@@ -3148,15 +2767,6 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3148 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); 2767 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
3149} 2768}
3150 2769
3151void tcp_send_window_probe(struct sock *sk)
3152{
3153 if (sk->sk_state == TCP_ESTABLISHED) {
3154 tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
3155 tcp_sk(sk)->snd_nxt = tcp_sk(sk)->write_seq;
3156 tcp_xmit_probe_skb(sk, 0);
3157 }
3158}
3159
3160/* Initiate keepalive or window probe from timer. */ 2770/* Initiate keepalive or window probe from timer. */
3161int tcp_write_wakeup(struct sock *sk) 2771int tcp_write_wakeup(struct sock *sk)
3162{ 2772{
@@ -3182,13 +2792,13 @@ int tcp_write_wakeup(struct sock *sk)
3182 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || 2792 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
3183 skb->len > mss) { 2793 skb->len > mss) {
3184 seg_size = min(seg_size, mss); 2794 seg_size = min(seg_size, mss);
3185 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 2795 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
3186 if (tcp_fragment(sk, skb, seg_size, mss)) 2796 if (tcp_fragment(sk, skb, seg_size, mss))
3187 return -1; 2797 return -1;
3188 } else if (!tcp_skb_pcount(skb)) 2798 } else if (!tcp_skb_pcount(skb))
3189 tcp_set_skb_tso_segs(sk, skb, mss); 2799 tcp_set_skb_tso_segs(sk, skb, mss);
3190 2800
3191 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; 2801 TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
3192 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2802 TCP_SKB_CB(skb)->when = tcp_time_stamp;
3193 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); 2803 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
3194 if (!err) 2804 if (!err)
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 4526fe68e60..85ee7eb7e38 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -18,8 +18,6 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
 #include <linux/socket.h>
@@ -91,7 +89,7 @@ static inline int tcp_probe_avail(void)
  * Note: arguments must match tcp_rcv_established()!
  */
 static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-			       struct tcphdr *th, unsigned int len)
+			       struct tcphdr *th, unsigned len)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_sock *inet = inet_sk(sk);
@@ -138,7 +136,7 @@ static struct jprobe tcp_jprobe = {
 	.entry	= jtcp_rcv_established,
 };
 
-static int tcpprobe_open(struct inode *inode, struct file *file)
+static int tcpprobe_open(struct inode * inode, struct file * file)
 {
 	/* Reset (empty) log */
 	spin_lock_bh(&tcp_probe.lock);
@@ -241,7 +239,7 @@ static __init int tcpprobe_init(void)
 	if (ret)
 		goto err1;
 
-	pr_info("probe registered (port=%d) bufsize=%u\n", port, bufsize);
+	pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize);
 	return 0;
  err1:
 	proc_net_remove(&init_net, procname);
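tcp_probe, patched above, logs one record per matching ACK into a fixed-size ring indexed by free-running head/tail counters, which the /proc reader then drains. Below is a hedged, generic sketch of such a single-producer ring with a power-of-two capacity; the structure and size are illustrative, not the module's own.

/* Hedged sketch of a fixed-size sample ring in the spirit of tcp_probe's
 * log; single producer, single consumer, power-of-two capacity. */
#include <stdio.h>

#define LOG_SIZE 8			/* must be a power of two */

struct sample { unsigned int snd_nxt, snd_una, cwnd; };

static struct sample log_buf[LOG_SIZE];
static unsigned int head, tail;		/* free-running counters */

static unsigned int log_used(void)  { return head - tail; }
static unsigned int log_avail(void) { return LOG_SIZE - log_used(); }

static int log_push(struct sample s)	/* called on each probed ACK */
{
	if (!log_avail())
		return -1;		/* full: drop the sample */
	log_buf[head++ & (LOG_SIZE - 1)] = s;
	return 0;
}

static int log_pop(struct sample *s)	/* called by the reader */
{
	if (!log_used())
		return -1;
	*s = log_buf[tail++ & (LOG_SIZE - 1)];
	return 0;
}

int main(void)
{
	struct sample s;
	unsigned int i;

	for (i = 0; i < 10; i++)	/* the last two do not fit and are dropped */
		log_push((struct sample){ .snd_nxt = i, .snd_una = i, .cwnd = 10 });
	while (!log_pop(&s))
		printf("snd_nxt=%u cwnd=%u\n", s.snd_nxt, s.cwnd);
	return 0;
}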
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b78aac30c49..ecd44b0c45f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -32,6 +32,17 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
32int sysctl_tcp_orphan_retries __read_mostly; 32int sysctl_tcp_orphan_retries __read_mostly;
33int sysctl_tcp_thin_linear_timeouts __read_mostly; 33int sysctl_tcp_thin_linear_timeouts __read_mostly;
34 34
35static void tcp_write_timer(unsigned long);
36static void tcp_delack_timer(unsigned long);
37static void tcp_keepalive_timer (unsigned long data);
38
39void tcp_init_xmit_timers(struct sock *sk)
40{
41 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
42 &tcp_keepalive_timer);
43}
44EXPORT_SYMBOL(tcp_init_xmit_timers);
45
35static void tcp_write_err(struct sock *sk) 46static void tcp_write_err(struct sock *sk)
36{ 47{
37 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; 48 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
@@ -66,7 +77,10 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
 	if (sk->sk_err_soft)
 		shift++;
 
-	if (tcp_check_oom(sk, shift)) {
+	if (tcp_too_many_orphans(sk, shift)) {
+		if (net_ratelimit())
+			printk(KERN_INFO "Out of socket memory\n");
+
 		/* Catch exceptional cases, when connection requires reset.
 		 *      1. Last segment was sent recently. */
 		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
@@ -157,13 +171,13 @@ static int tcp_write_timeout(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	int retry_until;
-	bool do_reset, syn_set = false;
+	bool do_reset, syn_set = 0;
 
 	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 		if (icsk->icsk_retransmits)
 			dst_negative_advice(sk);
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
-		syn_set = true;
+		syn_set = 1;
 	} else {
 		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
 			/* Black hole detection */
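tcp_write_timeout() and retransmits_timed_out() above decide when to give up on a connection: each retransmission doubles the RTO until it reaches TCP_RTO_MAX, and the retry boundaries (sysctl_tcp_retries1/2) are converted into an equivalent elapsed-time budget. A hedged sketch of that arithmetic with the usual defaults follows; the kernel's closed-form version switches to the flat cap slightly differently, so its retries2 budget comes out near 15 minutes rather than the roughly 13 minutes this loop gives.

/* Hedged sketch of the retransmission give-up bound: RTO doubles from a
 * base value until it hits TCP_RTO_MAX, after which each retry costs a
 * flat TCP_RTO_MAX.  Values are milliseconds; constants are the usual
 * Linux defaults, used purely for illustration. */
#include <stdio.h>

#define RTO_BASE	200U	/* TCP_RTO_MIN for established connections */
#define RTO_MAX		120000U	/* TCP_RTO_MAX: 120 s */

static unsigned int timeout_for_retries(unsigned int boundary)
{
	unsigned int timeout = 0, rto = RTO_BASE, i;

	for (i = 0; i < boundary; i++) {
		timeout += rto;		/* wait one RTO, then retransmit */
		rto = rto * 2 > RTO_MAX ? RTO_MAX : rto * 2;
	}
	return timeout;
}

int main(void)
{
	/* tcp_retries1 (default 3) triggers PMTU black-hole probing,
	 * tcp_retries2 (default 15) gives up on the connection. */
	printf("retries1=3  -> %u ms\n", timeout_for_retries(3));	/* 1400 ms */
	printf("retries2=15 -> %u ms\n", timeout_for_retries(15));	/* ~13.4 min */
	return 0;
}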
@@ -194,11 +208,21 @@ static int tcp_write_timeout(struct sock *sk)
194 return 0; 208 return 0;
195} 209}
196 210
197void tcp_delack_timer_handler(struct sock *sk) 211static void tcp_delack_timer(unsigned long data)
198{ 212{
213 struct sock *sk = (struct sock *)data;
199 struct tcp_sock *tp = tcp_sk(sk); 214 struct tcp_sock *tp = tcp_sk(sk);
200 struct inet_connection_sock *icsk = inet_csk(sk); 215 struct inet_connection_sock *icsk = inet_csk(sk);
201 216
217 bh_lock_sock(sk);
218 if (sock_owned_by_user(sk)) {
219 /* Try again later. */
220 icsk->icsk_ack.blocked = 1;
221 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
222 sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
223 goto out_unlock;
224 }
225
202 sk_mem_reclaim_partial(sk); 226 sk_mem_reclaim_partial(sk);
203 227
204 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) 228 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
@@ -237,24 +261,9 @@ void tcp_delack_timer_handler(struct sock *sk)
237 } 261 }
238 262
239out: 263out:
240 if (sk_under_memory_pressure(sk)) 264 if (tcp_memory_pressure)
241 sk_mem_reclaim(sk); 265 sk_mem_reclaim(sk);
242} 266out_unlock:
243
244static void tcp_delack_timer(unsigned long data)
245{
246 struct sock *sk = (struct sock *)data;
247
248 bh_lock_sock(sk);
249 if (!sock_owned_by_user(sk)) {
250 tcp_delack_timer_handler(sk);
251 } else {
252 inet_csk(sk)->icsk_ack.blocked = 1;
253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
254 /* deleguate our work to tcp_release_cb() */
255 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
256 sock_hold(sk);
257 }
258 bh_unlock_sock(sk); 267 bh_unlock_sock(sk);
259 sock_put(sk); 268 sock_put(sk);
260} 269}
@@ -305,35 +314,6 @@ static void tcp_probe_timer(struct sock *sk)
305} 314}
306 315
307/* 316/*
308 * Timer for Fast Open socket to retransmit SYNACK. Note that the
309 * sk here is the child socket, not the parent (listener) socket.
310 */
311static void tcp_fastopen_synack_timer(struct sock *sk)
312{
313 struct inet_connection_sock *icsk = inet_csk(sk);
314 int max_retries = icsk->icsk_syn_retries ? :
315 sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
316 struct request_sock *req;
317
318 req = tcp_sk(sk)->fastopen_rsk;
319 req->rsk_ops->syn_ack_timeout(sk, req);
320
321 if (req->num_timeout >= max_retries) {
322 tcp_write_err(sk);
323 return;
324 }
325 /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
326 * returned from rtx_syn_ack() to make it more persistent like
327 * regular retransmit because if the child socket has been accepted
328 * it's not good to give up too easily.
329 */
330 inet_rtx_syn_ack(sk, req);
331 req->num_timeout++;
332 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
333 TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
334}
335
336/*
337 * The TCP retransmit timer. 317 * The TCP retransmit timer.
338 */ 318 */
339 319
@@ -342,19 +322,6 @@ void tcp_retransmit_timer(struct sock *sk)
342 struct tcp_sock *tp = tcp_sk(sk); 322 struct tcp_sock *tp = tcp_sk(sk);
343 struct inet_connection_sock *icsk = inet_csk(sk); 323 struct inet_connection_sock *icsk = inet_csk(sk);
344 324
345 if (tp->early_retrans_delayed) {
346 tcp_resume_early_retransmit(sk);
347 return;
348 }
349 if (tp->fastopen_rsk) {
350 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
351 sk->sk_state != TCP_FIN_WAIT1);
352 tcp_fastopen_synack_timer(sk);
353 /* Before we receive ACK to our SYN-ACK don't retransmit
354 * anything else (e.g., data or FIN segments).
355 */
356 return;
357 }
358 if (!tp->packets_out) 325 if (!tp->packets_out)
359 goto out; 326 goto out;
360 327
@@ -367,22 +334,22 @@ void tcp_retransmit_timer(struct sock *sk)
367 * connection. If the socket is an orphan, time it out, 334 * connection. If the socket is an orphan, time it out,
368 * we cannot allow such beasts to hang infinitely. 335 * we cannot allow such beasts to hang infinitely.
369 */ 336 */
337#ifdef TCP_DEBUG
370 struct inet_sock *inet = inet_sk(sk); 338 struct inet_sock *inet = inet_sk(sk);
371 if (sk->sk_family == AF_INET) { 339 if (sk->sk_family == AF_INET) {
372 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), 340 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
373 &inet->inet_daddr, 341 &inet->inet_daddr, ntohs(inet->inet_dport),
374 ntohs(inet->inet_dport), inet->inet_num, 342 inet->inet_num, tp->snd_una, tp->snd_nxt);
375 tp->snd_una, tp->snd_nxt);
376 } 343 }
377#if IS_ENABLED(CONFIG_IPV6) 344#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
378 else if (sk->sk_family == AF_INET6) { 345 else if (sk->sk_family == AF_INET6) {
379 struct ipv6_pinfo *np = inet6_sk(sk); 346 struct ipv6_pinfo *np = inet6_sk(sk);
380 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), 347 LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
381 &np->daddr, 348 &np->daddr, ntohs(inet->inet_dport),
382 ntohs(inet->inet_dport), inet->inet_num, 349 inet->inet_num, tp->snd_una, tp->snd_nxt);
383 tp->snd_una, tp->snd_nxt);
384 } 350 }
385#endif 351#endif
352#endif
386 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { 353 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
387 tcp_write_err(sk); 354 tcp_write_err(sk);
388 goto out; 355 goto out;
@@ -481,11 +448,19 @@ out_reset_timer:
481out:; 448out:;
482} 449}
483 450
484void tcp_write_timer_handler(struct sock *sk) 451static void tcp_write_timer(unsigned long data)
485{ 452{
453 struct sock *sk = (struct sock *)data;
486 struct inet_connection_sock *icsk = inet_csk(sk); 454 struct inet_connection_sock *icsk = inet_csk(sk);
487 int event; 455 int event;
488 456
457 bh_lock_sock(sk);
458 if (sock_owned_by_user(sk)) {
459 /* Try again later */
460 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
461 goto out_unlock;
462 }
463
489 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) 464 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
490 goto out; 465 goto out;
491 466
@@ -508,20 +483,7 @@ void tcp_write_timer_handler(struct sock *sk)
508 483
509out: 484out:
510 sk_mem_reclaim(sk); 485 sk_mem_reclaim(sk);
511} 486out_unlock:
512
513static void tcp_write_timer(unsigned long data)
514{
515 struct sock *sk = (struct sock *)data;
516
517 bh_lock_sock(sk);
518 if (!sock_owned_by_user(sk)) {
519 tcp_write_timer_handler(sk);
520 } else {
521 /* deleguate our work to tcp_release_cb() */
522 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
523 sock_hold(sk);
524 }
525 bh_unlock_sock(sk); 487 bh_unlock_sock(sk);
526 sock_put(sk); 488 sock_put(sk);
527} 489}
@@ -638,10 +600,3 @@ out:
638 bh_unlock_sock(sk); 600 bh_unlock_sock(sk);
639 sock_put(sk); 601 sock_put(sk);
640} 602}
641
642void tcp_init_xmit_timers(struct sock *sk)
643{
644 inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
645 &tcp_keepalive_timer);
646}
647EXPORT_SYMBOL(tcp_init_xmit_timers);
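Both timer hunks above toggle the same locking pattern: the timer callback takes bh_lock_sock(), and if the socket is currently owned by a process (sock_owned_by_user()) the real work is postponed instead of running under contention — either by re-arming the timer shortly (the older code restored here) or by setting a deferred bit for tcp_release_cb() (the removed lines). Below is a minimal user-space model of that defer-or-run idea; the struct and helper names are invented for the illustration and are not kernel APIs.

/*
 * Minimal model of the defer-or-run timer pattern seen in
 * tcp_write_timer()/tcp_delack_timer().  Not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_sock {
	bool owned_by_user;   /* models sock_owned_by_user() */
	bool work_deferred;   /* models the deferred-work flag/bit */
};

static void timer_handler(struct fake_sock *sk)
{
	(void)sk;
	printf("doing the real timer work now\n");
}

/* What the timer callback does in softirq context. */
static void timer_fired(struct fake_sock *sk)
{
	if (!sk->owned_by_user)
		timer_handler(sk);          /* socket not busy: run immediately */
	else
		sk->work_deferred = true;   /* busy: remember it for later */
}

/* What release_sock()/tcp_release_cb() would do when user space lets go. */
static void release_sock_cb(struct fake_sock *sk)
{
	sk->owned_by_user = false;
	if (sk->work_deferred) {
		sk->work_deferred = false;
		timer_handler(sk);
	}
}

int main(void)
{
	struct fake_sock sk = { .owned_by_user = true };

	timer_fired(&sk);      /* deferred: socket is owned by a process */
	release_sock_cb(&sk);  /* deferred work runs here instead */
	return 0;
}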
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 0d017183062..ac3b3ee4b07 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -105,7 +105,7 @@ drop:
105 return 0; 105 return 0;
106} 106}
107 107
108#if IS_ENABLED(CONFIG_IPV6) 108#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
109static int tunnel64_rcv(struct sk_buff *skb) 109static int tunnel64_rcv(struct sk_buff *skb)
110{ 110{
111 struct xfrm_tunnel *handler; 111 struct xfrm_tunnel *handler;
@@ -134,7 +134,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
134 break; 134 break;
135} 135}
136 136
137#if IS_ENABLED(CONFIG_IPV6) 137#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
138static void tunnel64_err(struct sk_buff *skb, u32 info) 138static void tunnel64_err(struct sk_buff *skb, u32 info)
139{ 139{
140 struct xfrm_tunnel *handler; 140 struct xfrm_tunnel *handler;
@@ -152,7 +152,7 @@ static const struct net_protocol tunnel4_protocol = {
152 .netns_ok = 1, 152 .netns_ok = 1,
153}; 153};
154 154
155#if IS_ENABLED(CONFIG_IPV6) 155#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
156static const struct net_protocol tunnel64_protocol = { 156static const struct net_protocol tunnel64_protocol = {
157 .handler = tunnel64_rcv, 157 .handler = tunnel64_rcv,
158 .err_handler = tunnel64_err, 158 .err_handler = tunnel64_err,
@@ -164,12 +164,12 @@ static const struct net_protocol tunnel64_protocol = {
164static int __init tunnel4_init(void) 164static int __init tunnel4_init(void)
165{ 165{
166 if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) { 166 if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) {
167 pr_err("%s: can't add protocol\n", __func__); 167 printk(KERN_ERR "tunnel4 init: can't add protocol\n");
168 return -EAGAIN; 168 return -EAGAIN;
169 } 169 }
170#if IS_ENABLED(CONFIG_IPV6) 170#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
171 if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { 171 if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) {
172 pr_err("tunnel64 init: can't add protocol\n"); 172 printk(KERN_ERR "tunnel64 init: can't add protocol\n");
173 inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); 173 inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
174 return -EAGAIN; 174 return -EAGAIN;
175 } 175 }
@@ -179,12 +179,12 @@ static int __init tunnel4_init(void)
179 179
180static void __exit tunnel4_fini(void) 180static void __exit tunnel4_fini(void)
181{ 181{
182#if IS_ENABLED(CONFIG_IPV6) 182#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
183 if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) 183 if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6))
184 pr_err("tunnel64 close: can't remove protocol\n"); 184 printk(KERN_ERR "tunnel64 close: can't remove protocol\n");
185#endif 185#endif
186 if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) 186 if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP))
187 pr_err("tunnel4 close: can't remove protocol\n"); 187 printk(KERN_ERR "tunnel4 close: can't remove protocol\n");
188} 188}
189 189
190module_init(tunnel4_init); 190module_init(tunnel4_init);
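The tunnel4.c hunks swap IS_ENABLED(CONFIG_IPV6) for the older explicit pair of defined() tests; IS_ENABLED(CONFIG_FOO) is true when the option is built in (=y) or modular (=m). The sketch below is a simplified, stand-alone re-implementation of that preprocessor trick — it mirrors the idea in include/linux/kconfig.h but is not the kernel header verbatim, and the CONFIG_FOO/CONFIG_BAR/CONFIG_BAZ symbols are made up for the demo.

/* Simplified IS_ENABLED() demo; compiles and runs on its own. */
#include <stdio.h>

#define CONFIG_FOO 1          /* pretend FOO=y */
#define CONFIG_BAR_MODULE 1   /* pretend BAR=m */
/* CONFIG_BAZ intentionally left undefined: BAZ=n */

#define __ARG_PLACEHOLDER_1 0,
#define __take_second(ignored, val, ...) val
#define __is_defined_as_1(junk_or_zero) __take_second(junk_or_zero 1, 0)
#define _is_defined_as_1(value) __is_defined_as_1(__ARG_PLACEHOLDER_##value)
#define is_defined_as_1(cfg) _is_defined_as_1(cfg)

#define IS_ENABLED(option) \
	(is_defined_as_1(option) || is_defined_as_1(option##_MODULE))

int main(void)
{
	printf("FOO enabled? %d\n", IS_ENABLED(CONFIG_FOO));  /* prints 1 */
	printf("BAR enabled? %d\n", IS_ENABLED(CONFIG_BAR));  /* prints 1 */
	printf("BAZ enabled? %d\n", IS_ENABLED(CONFIG_BAZ));  /* prints 0 */
	return 0;
}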
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 79c8dbe59b5..1b5a19340a9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -77,8 +77,7 @@
77 * 2 of the License, or (at your option) any later version. 77 * 2 of the License, or (at your option) any later version.
78 */ 78 */
79 79
80#define pr_fmt(fmt) "UDP: " fmt 80#include <asm/system.h>
81
82#include <asm/uaccess.h> 81#include <asm/uaccess.h>
83#include <asm/ioctls.h> 82#include <asm/ioctls.h>
84#include <linux/bootmem.h> 83#include <linux/bootmem.h>
@@ -107,8 +106,6 @@
107#include <net/checksum.h> 106#include <net/checksum.h>
108#include <net/xfrm.h> 107#include <net/xfrm.h>
109#include <trace/events/udp.h> 108#include <trace/events/udp.h>
110#include <linux/static_key.h>
111#include <trace/events/skb.h>
112#include "udp_impl.h" 109#include "udp_impl.h"
113 110
114struct udp_table udp_table __read_mostly; 111struct udp_table udp_table __read_mostly;
@@ -208,7 +205,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
208 205
209 if (!snum) { 206 if (!snum) {
210 int low, high, remaining; 207 int low, high, remaining;
211 unsigned int rand; 208 unsigned rand;
212 unsigned short first, last; 209 unsigned short first, last;
213 DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); 210 DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
214 211
@@ -448,7 +445,7 @@ exact_match:
448/* UDP is nearly always wildcards out the wazoo, it makes no sense to try 445/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
449 * harder than this. -DaveM 446 * harder than this. -DaveM
450 */ 447 */
451struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, 448static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
452 __be16 sport, __be32 daddr, __be16 dport, 449 __be16 sport, __be32 daddr, __be16 dport,
453 int dif, struct udp_table *udptable) 450 int dif, struct udp_table *udptable)
454{ 451{
@@ -515,7 +512,6 @@ begin:
515 rcu_read_unlock(); 512 rcu_read_unlock();
516 return result; 513 return result;
517} 514}
518EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
519 515
520static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, 516static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
521 __be16 sport, __be16 dport, 517 __be16 sport, __be16 dport,
@@ -616,7 +612,6 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
616 break; 612 break;
617 case ICMP_DEST_UNREACH: 613 case ICMP_DEST_UNREACH:
618 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ 614 if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
619 ipv4_sk_update_pmtu(skb, sk, info);
620 if (inet->pmtudisc != IP_PMTUDISC_DONT) { 615 if (inet->pmtudisc != IP_PMTUDISC_DONT) {
621 err = EMSGSIZE; 616 err = EMSGSIZE;
622 harderr = 1; 617 harderr = 1;
@@ -630,9 +625,6 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
630 err = icmp_err_convert[code].errno; 625 err = icmp_err_convert[code].errno;
631 } 626 }
632 break; 627 break;
633 case ICMP_REDIRECT:
634 ipv4_sk_redirect(skb, sk);
635 break;
636 } 628 }
637 629
638 /* 630 /*
@@ -758,7 +750,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
758 uh->check = CSUM_MANGLED_0; 750 uh->check = CSUM_MANGLED_0;
759 751
760send: 752send:
761 err = ip_send_skb(sock_net(sk), skb); 753 err = ip_send_skb(skb);
762 if (err) { 754 if (err) {
763 if (err == -ENOBUFS && !inet->recverr) { 755 if (err == -ENOBUFS && !inet->recverr) {
764 UDP_INC_STATS_USER(sock_net(sk), 756 UDP_INC_STATS_USER(sock_net(sk),
@@ -852,7 +844,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
852 * Get and verify the address. 844 * Get and verify the address.
853 */ 845 */
854 if (msg->msg_name) { 846 if (msg->msg_name) {
855 struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; 847 struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name;
856 if (msg->msg_namelen < sizeof(*usin)) 848 if (msg->msg_namelen < sizeof(*usin))
857 return -EINVAL; 849 return -EINVAL;
858 if (usin->sin_family != AF_INET) { 850 if (usin->sin_family != AF_INET) {
@@ -924,8 +916,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
924 if (!saddr) 916 if (!saddr)
925 saddr = inet->mc_addr; 917 saddr = inet->mc_addr;
926 connected = 0; 918 connected = 0;
927 } else if (!ipc.oif) 919 }
928 ipc.oif = inet->uc_index;
929 920
930 if (connected) 921 if (connected)
931 rt = (struct rtable *)sk_dst_check(sk, 0); 922 rt = (struct rtable *)sk_dst_check(sk, 0);
@@ -982,7 +973,7 @@ back_from_confirm:
982 /* ... which is an evident application bug. --ANK */ 973 /* ... which is an evident application bug. --ANK */
983 release_sock(sk); 974 release_sock(sk);
984 975
985 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("cork app bug 2\n")); 976 LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
986 err = -EINVAL; 977 err = -EINVAL;
987 goto out; 978 goto out;
988 } 979 }
@@ -1061,7 +1052,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset,
1061 if (unlikely(!up->pending)) { 1052 if (unlikely(!up->pending)) {
1062 release_sock(sk); 1053 release_sock(sk);
1063 1054
1064 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("udp cork app bug 3\n")); 1055 LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
1065 return -EINVAL; 1056 return -EINVAL;
1066 } 1057 }
1067 1058
@@ -1173,8 +1164,8 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1173 struct inet_sock *inet = inet_sk(sk); 1164 struct inet_sock *inet = inet_sk(sk);
1174 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; 1165 struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
1175 struct sk_buff *skb; 1166 struct sk_buff *skb;
1176 unsigned int ulen, copied; 1167 unsigned int ulen;
1177 int peeked, off = 0; 1168 int peeked;
1178 int err; 1169 int err;
1179 int is_udplite = IS_UDPLITE(sk); 1170 int is_udplite = IS_UDPLITE(sk);
1180 bool slow; 1171 bool slow;
@@ -1190,15 +1181,14 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1190 1181
1191try_again: 1182try_again:
1192 skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), 1183 skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
1193 &peeked, &off, &err); 1184 &peeked, &err);
1194 if (!skb) 1185 if (!skb)
1195 goto out; 1186 goto out;
1196 1187
1197 ulen = skb->len - sizeof(struct udphdr); 1188 ulen = skb->len - sizeof(struct udphdr);
1198 copied = len; 1189 if (len > ulen)
1199 if (copied > ulen) 1190 len = ulen;
1200 copied = ulen; 1191 else if (len < ulen)
1201 else if (copied < ulen)
1202 msg->msg_flags |= MSG_TRUNC; 1192 msg->msg_flags |= MSG_TRUNC;
1203 1193
1204 /* 1194 /*
@@ -1207,14 +1197,14 @@ try_again:
1207 * coverage checksum (UDP-Lite), do it before the copy. 1197 * coverage checksum (UDP-Lite), do it before the copy.
1208 */ 1198 */
1209 1199
1210 if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { 1200 if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
1211 if (udp_lib_checksum_complete(skb)) 1201 if (udp_lib_checksum_complete(skb))
1212 goto csum_copy_err; 1202 goto csum_copy_err;
1213 } 1203 }
1214 1204
1215 if (skb_csum_unnecessary(skb)) 1205 if (skb_csum_unnecessary(skb))
1216 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), 1206 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
1217 msg->msg_iov, copied); 1207 msg->msg_iov, len);
1218 else { 1208 else {
1219 err = skb_copy_and_csum_datagram_iovec(skb, 1209 err = skb_copy_and_csum_datagram_iovec(skb,
1220 sizeof(struct udphdr), 1210 sizeof(struct udphdr),
@@ -1224,15 +1214,8 @@ try_again:
1224 goto csum_copy_err; 1214 goto csum_copy_err;
1225 } 1215 }
1226 1216
1227 if (unlikely(err)) { 1217 if (err)
1228 trace_kfree_skb(skb, udp_recvmsg);
1229 if (!peeked) {
1230 atomic_inc(&sk->sk_drops);
1231 UDP_INC_STATS_USER(sock_net(sk),
1232 UDP_MIB_INERRORS, is_udplite);
1233 }
1234 goto out_free; 1218 goto out_free;
1235 }
1236 1219
1237 if (!peeked) 1220 if (!peeked)
1238 UDP_INC_STATS_USER(sock_net(sk), 1221 UDP_INC_STATS_USER(sock_net(sk),
@@ -1250,7 +1233,7 @@ try_again:
1250 if (inet->cmsg_flags) 1233 if (inet->cmsg_flags)
1251 ip_cmsg_recv(msg, skb); 1234 ip_cmsg_recv(msg, skb);
1252 1235
1253 err = copied; 1236 err = len;
1254 if (flags & MSG_TRUNC) 1237 if (flags & MSG_TRUNC)
1255 err = ulen; 1238 err = ulen;
1256 1239
@@ -1284,7 +1267,7 @@ int udp_disconnect(struct sock *sk, int flags)
1284 sk->sk_state = TCP_CLOSE; 1267 sk->sk_state = TCP_CLOSE;
1285 inet->inet_daddr = 0; 1268 inet->inet_daddr = 0;
1286 inet->inet_dport = 0; 1269 inet->inet_dport = 0;
1287 sock_rps_reset_rxhash(sk); 1270 sock_rps_save_rxhash(sk, 0);
1288 sk->sk_bound_dev_if = 0; 1271 sk->sk_bound_dev_if = 0;
1289 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1272 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1290 inet_reset_saddr(sk); 1273 inet_reset_saddr(sk);
@@ -1372,9 +1355,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1372 int rc; 1355 int rc;
1373 1356
1374 if (inet_sk(sk)->inet_daddr) 1357 if (inet_sk(sk)->inet_daddr)
1375 sock_rps_save_rxhash(sk, skb); 1358 sock_rps_save_rxhash(sk, skb->rxhash);
1376 1359
1377 rc = sock_queue_rcv_skb(sk, skb); 1360 rc = ip_queue_rcv_skb(sk, skb);
1378 if (rc < 0) { 1361 if (rc < 0) {
1379 int is_udplite = IS_UDPLITE(sk); 1362 int is_udplite = IS_UDPLITE(sk);
1380 1363
@@ -1392,14 +1375,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1392 1375
1393} 1376}
1394 1377
1395static struct static_key udp_encap_needed __read_mostly;
1396void udp_encap_enable(void)
1397{
1398 if (!static_key_enabled(&udp_encap_needed))
1399 static_key_slow_inc(&udp_encap_needed);
1400}
1401EXPORT_SYMBOL(udp_encap_enable);
1402
1403/* returns: 1378/* returns:
1404 * -1: error 1379 * -1: error
1405 * 0: success 1380 * 0: success
@@ -1421,9 +1396,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1421 goto drop; 1396 goto drop;
1422 nf_reset(skb); 1397 nf_reset(skb);
1423 1398
1424 if (static_key_false(&udp_encap_needed) && up->encap_type) { 1399 if (up->encap_type) {
1425 int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
1426
1427 /* 1400 /*
1428 * This is an encapsulation socket so pass the skb to 1401 * This is an encapsulation socket so pass the skb to
1429 * the socket's udp_encap_rcv() hook. Otherwise, just 1402 * the socket's udp_encap_rcv() hook. Otherwise, just
@@ -1436,11 +1409,11 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1436 */ 1409 */
1437 1410
1438 /* if we're overly short, let UDP handle it */ 1411 /* if we're overly short, let UDP handle it */
1439 encap_rcv = ACCESS_ONCE(up->encap_rcv); 1412 if (skb->len > sizeof(struct udphdr) &&
1440 if (skb->len > sizeof(struct udphdr) && encap_rcv != NULL) { 1413 up->encap_rcv != NULL) {
1441 int ret; 1414 int ret;
1442 1415
1443 ret = encap_rcv(sk, skb); 1416 ret = (*up->encap_rcv)(sk, skb);
1444 if (ret <= 0) { 1417 if (ret <= 0) {
1445 UDP_INC_STATS_BH(sock_net(sk), 1418 UDP_INC_STATS_BH(sock_net(sk),
1446 UDP_MIB_INDATAGRAMS, 1419 UDP_MIB_INDATAGRAMS,
@@ -1469,8 +1442,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1469 * provided by the application." 1442 * provided by the application."
1470 */ 1443 */
1471 if (up->pcrlen == 0) { /* full coverage was set */ 1444 if (up->pcrlen == 0) { /* full coverage was set */
1472 LIMIT_NETDEBUG(KERN_WARNING "UDPLite: partial coverage %d while full coverage %d requested\n", 1445 LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
1473 UDP_SKB_CB(skb)->cscov, skb->len); 1446 "%d while full coverage %d requested\n",
1447 UDP_SKB_CB(skb)->cscov, skb->len);
1474 goto drop; 1448 goto drop;
1475 } 1449 }
1476 /* The next case involves violating the min. coverage requested 1450 /* The next case involves violating the min. coverage requested
@@ -1480,27 +1454,28 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1480 * Therefore the above ...()->partial_cov statement is essential. 1454 * Therefore the above ...()->partial_cov statement is essential.
1481 */ 1455 */
1482 if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { 1456 if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
1483 LIMIT_NETDEBUG(KERN_WARNING "UDPLite: coverage %d too small, need min %d\n", 1457 LIMIT_NETDEBUG(KERN_WARNING
1484 UDP_SKB_CB(skb)->cscov, up->pcrlen); 1458 "UDPLITE: coverage %d too small, need min %d\n",
1459 UDP_SKB_CB(skb)->cscov, up->pcrlen);
1485 goto drop; 1460 goto drop;
1486 } 1461 }
1487 } 1462 }
1488 1463
1489 if (rcu_access_pointer(sk->sk_filter) && 1464 if (rcu_dereference_raw(sk->sk_filter)) {
1490 udp_lib_checksum_complete(skb)) 1465 if (udp_lib_checksum_complete(skb))
1491 goto drop; 1466 goto drop;
1467 }
1492 1468
1493 1469
1494 if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) 1470 if (sk_rcvqueues_full(sk, skb))
1495 goto drop; 1471 goto drop;
1496 1472
1497 rc = 0; 1473 rc = 0;
1498 1474
1499 ipv4_pktinfo_prepare(skb);
1500 bh_lock_sock(sk); 1475 bh_lock_sock(sk);
1501 if (!sock_owned_by_user(sk)) 1476 if (!sock_owned_by_user(sk))
1502 rc = __udp_queue_rcv_skb(sk, skb); 1477 rc = __udp_queue_rcv_skb(sk, skb);
1503 else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) { 1478 else if (sk_add_backlog(sk, skb)) {
1504 bh_unlock_sock(sk); 1479 bh_unlock_sock(sk);
1505 goto drop; 1480 goto drop;
1506 } 1481 }
@@ -1709,10 +1684,13 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1709 1684
1710short_packet: 1685short_packet:
1711 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", 1686 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
1712 proto == IPPROTO_UDPLITE ? "Lite" : "", 1687 proto == IPPROTO_UDPLITE ? "-Lite" : "",
1713 &saddr, ntohs(uh->source), 1688 &saddr,
1714 ulen, skb->len, 1689 ntohs(uh->source),
1715 &daddr, ntohs(uh->dest)); 1690 ulen,
1691 skb->len,
1692 &daddr,
1693 ntohs(uh->dest));
1716 goto drop; 1694 goto drop;
1717 1695
1718csum_error: 1696csum_error:
@@ -1721,8 +1699,11 @@ csum_error:
1721 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 1699 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1722 */ 1700 */
1723 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", 1701 LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
1724 proto == IPPROTO_UDPLITE ? "Lite" : "", 1702 proto == IPPROTO_UDPLITE ? "-Lite" : "",
1725 &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), 1703 &saddr,
1704 ntohs(uh->source),
1705 &daddr,
1706 ntohs(uh->dest),
1726 ulen); 1707 ulen);
1727drop: 1708drop:
1728 UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); 1709 UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
@@ -1781,7 +1762,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1781 /* FALLTHROUGH */ 1762 /* FALLTHROUGH */
1782 case UDP_ENCAP_L2TPINUDP: 1763 case UDP_ENCAP_L2TPINUDP:
1783 up->encap_type = val; 1764 up->encap_type = val;
1784 udp_encap_enable();
1785 break; 1765 break;
1786 default: 1766 default:
1787 err = -ENOPROTOOPT; 1767 err = -ENOPROTOOPT;
@@ -2058,7 +2038,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
2058 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); 2038 spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
2059} 2039}
2060 2040
2061int udp_seq_open(struct inode *inode, struct file *file) 2041static int udp_seq_open(struct inode *inode, struct file *file)
2062{ 2042{
2063 struct udp_seq_afinfo *afinfo = PDE(inode)->data; 2043 struct udp_seq_afinfo *afinfo = PDE(inode)->data;
2064 struct udp_iter_state *s; 2044 struct udp_iter_state *s;
@@ -2074,7 +2054,6 @@ int udp_seq_open(struct inode *inode, struct file *file)
2074 s->udp_table = afinfo->udp_table; 2054 s->udp_table = afinfo->udp_table;
2075 return err; 2055 return err;
2076} 2056}
2077EXPORT_SYMBOL(udp_seq_open);
2078 2057
2079/* ------------------------------------------------------------------------ */ 2058/* ------------------------------------------------------------------------ */
2080int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) 2059int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
@@ -2082,12 +2061,17 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
2082 struct proc_dir_entry *p; 2061 struct proc_dir_entry *p;
2083 int rc = 0; 2062 int rc = 0;
2084 2063
2064 afinfo->seq_fops.open = udp_seq_open;
2065 afinfo->seq_fops.read = seq_read;
2066 afinfo->seq_fops.llseek = seq_lseek;
2067 afinfo->seq_fops.release = seq_release_net;
2068
2085 afinfo->seq_ops.start = udp_seq_start; 2069 afinfo->seq_ops.start = udp_seq_start;
2086 afinfo->seq_ops.next = udp_seq_next; 2070 afinfo->seq_ops.next = udp_seq_next;
2087 afinfo->seq_ops.stop = udp_seq_stop; 2071 afinfo->seq_ops.stop = udp_seq_stop;
2088 2072
2089 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, 2073 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2090 afinfo->seq_fops, afinfo); 2074 &afinfo->seq_fops, afinfo);
2091 if (!p) 2075 if (!p)
2092 rc = -ENOMEM; 2076 rc = -ENOMEM;
2093 return rc; 2077 return rc;
@@ -2115,9 +2099,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
2115 bucket, src, srcp, dest, destp, sp->sk_state, 2099 bucket, src, srcp, dest, destp, sp->sk_state,
2116 sk_wmem_alloc_get(sp), 2100 sk_wmem_alloc_get(sp),
2117 sk_rmem_alloc_get(sp), 2101 sk_rmem_alloc_get(sp),
2118 0, 0L, 0, 2102 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
2119 from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
2120 0, sock_i_ino(sp),
2121 atomic_read(&sp->sk_refcnt), sp, 2103 atomic_read(&sp->sk_refcnt), sp,
2122 atomic_read(&sp->sk_drops), len); 2104 atomic_read(&sp->sk_drops), len);
2123} 2105}
@@ -2139,20 +2121,14 @@ int udp4_seq_show(struct seq_file *seq, void *v)
2139 return 0; 2121 return 0;
2140} 2122}
2141 2123
2142static const struct file_operations udp_afinfo_seq_fops = {
2143 .owner = THIS_MODULE,
2144 .open = udp_seq_open,
2145 .read = seq_read,
2146 .llseek = seq_lseek,
2147 .release = seq_release_net
2148};
2149
2150/* ------------------------------------------------------------------------ */ 2124/* ------------------------------------------------------------------------ */
2151static struct udp_seq_afinfo udp4_seq_afinfo = { 2125static struct udp_seq_afinfo udp4_seq_afinfo = {
2152 .name = "udp", 2126 .name = "udp",
2153 .family = AF_INET, 2127 .family = AF_INET,
2154 .udp_table = &udp_table, 2128 .udp_table = &udp_table,
2155 .seq_fops = &udp_afinfo_seq_fops, 2129 .seq_fops = {
2130 .owner = THIS_MODULE,
2131 },
2156 .seq_ops = { 2132 .seq_ops = {
2157 .show = udp4_seq_show, 2133 .show = udp4_seq_show,
2158 }, 2134 },
@@ -2187,15 +2163,9 @@ void udp4_proc_exit(void)
2187static __initdata unsigned long uhash_entries; 2163static __initdata unsigned long uhash_entries;
2188static int __init set_uhash_entries(char *str) 2164static int __init set_uhash_entries(char *str)
2189{ 2165{
2190 ssize_t ret;
2191
2192 if (!str) 2166 if (!str)
2193 return 0; 2167 return 0;
2194 2168 uhash_entries = simple_strtoul(str, &str, 0);
2195 ret = kstrtoul(str, 0, &uhash_entries);
2196 if (ret)
2197 return 0;
2198
2199 if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) 2169 if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
2200 uhash_entries = UDP_HTABLE_SIZE_MIN; 2170 uhash_entries = UDP_HTABLE_SIZE_MIN;
2201 return 1; 2171 return 1;
@@ -2206,16 +2176,26 @@ void __init udp_table_init(struct udp_table *table, const char *name)
2206{ 2176{
2207 unsigned int i; 2177 unsigned int i;
2208 2178
2209 table->hash = alloc_large_system_hash(name, 2179 if (!CONFIG_BASE_SMALL)
2210 2 * sizeof(struct udp_hslot), 2180 table->hash = alloc_large_system_hash(name,
2211 uhash_entries, 2181 2 * sizeof(struct udp_hslot),
2212 21, /* one slot per 2 MB */ 2182 uhash_entries,
2213 0, 2183 21, /* one slot per 2 MB */
2214 &table->log, 2184 0,
2215 &table->mask, 2185 &table->log,
2216 UDP_HTABLE_SIZE_MIN, 2186 &table->mask,
2217 64 * 1024); 2187 64 * 1024);
2218 2188 /*
2189 * Make sure hash table has the minimum size
2190 */
2191 if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
2192 table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
2193 2 * sizeof(struct udp_hslot), GFP_KERNEL);
2194 if (!table->hash)
2195 panic(name);
2196 table->log = ilog2(UDP_HTABLE_SIZE_MIN);
2197 table->mask = UDP_HTABLE_SIZE_MIN - 1;
2198 }
2219 table->hash2 = table->hash + (table->mask + 1); 2199 table->hash2 = table->hash + (table->mask + 1);
2220 for (i = 0; i <= table->mask; i++) { 2200 for (i = 0; i <= table->mask; i++) {
2221 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); 2201 INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
@@ -2263,8 +2243,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
2263 return 0; 2243 return 0;
2264} 2244}
2265 2245
2266struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, 2246struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
2267 netdev_features_t features)
2268{ 2247{
2269 struct sk_buff *segs = ERR_PTR(-EINVAL); 2248 struct sk_buff *segs = ERR_PTR(-EINVAL);
2270 unsigned int mss; 2249 unsigned int mss;
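The udp_recvmsg() hunk above reverts from a separate "copied" variable back to clamping "len" directly, but both sides implement the same contract: copy at most one datagram's worth of data into the caller's buffer and report MSG_TRUNC when the buffer was too small. The stand-alone program below shows that contract from user space with a pair of local UDP sockets; it is only a demo of the observable behaviour, not kernel-internal code, and error handling is omitted for brevity.

/* Demonstrates MSG_TRUNC in msg_flags when the receive buffer is short. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
	int tx = socket(AF_INET, SOCK_DGRAM, 0);
	int rx = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_addr.s_addr = htonl(INADDR_LOOPBACK) };
	socklen_t alen = sizeof(addr);
	char payload[100] = "a fairly long datagram";
	char small[10];
	struct iovec iov = { .iov_base = small, .iov_len = sizeof(small) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t n;

	bind(rx, (struct sockaddr *)&addr, sizeof(addr));   /* pick a free port */
	getsockname(rx, (struct sockaddr *)&addr, &alen);
	sendto(tx, payload, sizeof(payload), 0,
	       (struct sockaddr *)&addr, alen);

	n = recvmsg(rx, &msg, 0);
	printf("copied %zd of 100 bytes, MSG_TRUNC %s set\n",
	       n, (msg.msg_flags & MSG_TRUNC) ? "is" : "is not");

	close(tx);
	close(rx);
	return 0;
}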
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
deleted file mode 100644
index 505b30ad918..00000000000
--- a/net/ipv4/udp_diag.c
+++ /dev/null
@@ -1,216 +0,0 @@
1/*
2 * udp_diag.c Module for monitoring UDP transport protocols sockets.
3 *
4 * Authors: Pavel Emelyanov, <xemul@parallels.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12
13#include <linux/module.h>
14#include <linux/inet_diag.h>
15#include <linux/udp.h>
16#include <net/udp.h>
17#include <net/udplite.h>
18#include <linux/sock_diag.h>
19
20static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
21 struct netlink_callback *cb, struct inet_diag_req_v2 *req,
22 struct nlattr *bc)
23{
24 if (!inet_diag_bc_sk(bc, sk))
25 return 0;
26
27 return inet_sk_diag_fill(sk, NULL, skb, req,
28 sk_user_ns(NETLINK_CB(cb->skb).ssk),
29 NETLINK_CB(cb->skb).portid,
30 cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
31}
32
33static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
34 const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
35{
36 int err = -EINVAL;
37 struct sock *sk;
38 struct sk_buff *rep;
39 struct net *net = sock_net(in_skb->sk);
40
41 if (req->sdiag_family == AF_INET)
42 sk = __udp4_lib_lookup(net,
43 req->id.idiag_src[0], req->id.idiag_sport,
44 req->id.idiag_dst[0], req->id.idiag_dport,
45 req->id.idiag_if, tbl);
46#if IS_ENABLED(CONFIG_IPV6)
47 else if (req->sdiag_family == AF_INET6)
48 sk = __udp6_lib_lookup(net,
49 (struct in6_addr *)req->id.idiag_src,
50 req->id.idiag_sport,
51 (struct in6_addr *)req->id.idiag_dst,
52 req->id.idiag_dport,
53 req->id.idiag_if, tbl);
54#endif
55 else
56 goto out_nosk;
57
58 err = -ENOENT;
59 if (sk == NULL)
60 goto out_nosk;
61
62 err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
63 if (err)
64 goto out;
65
66 err = -ENOMEM;
67 rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) +
68 sizeof(struct inet_diag_meminfo) +
69 64)), GFP_KERNEL);
70 if (!rep)
71 goto out;
72
73 err = inet_sk_diag_fill(sk, NULL, rep, req,
74 sk_user_ns(NETLINK_CB(in_skb).ssk),
75 NETLINK_CB(in_skb).portid,
76 nlh->nlmsg_seq, 0, nlh);
77 if (err < 0) {
78 WARN_ON(err == -EMSGSIZE);
79 kfree_skb(rep);
80 goto out;
81 }
82 err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
83 MSG_DONTWAIT);
84 if (err > 0)
85 err = 0;
86out:
87 if (sk)
88 sock_put(sk);
89out_nosk:
90 return err;
91}
92
93static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb,
94 struct inet_diag_req_v2 *r, struct nlattr *bc)
95{
96 int num, s_num, slot, s_slot;
97 struct net *net = sock_net(skb->sk);
98
99 s_slot = cb->args[0];
100 num = s_num = cb->args[1];
101
102 for (slot = s_slot; slot <= table->mask; num = s_num = 0, slot++) {
103 struct sock *sk;
104 struct hlist_nulls_node *node;
105 struct udp_hslot *hslot = &table->hash[slot];
106
107 if (hlist_nulls_empty(&hslot->head))
108 continue;
109
110 spin_lock_bh(&hslot->lock);
111 sk_nulls_for_each(sk, node, &hslot->head) {
112 struct inet_sock *inet = inet_sk(sk);
113
114 if (!net_eq(sock_net(sk), net))
115 continue;
116 if (num < s_num)
117 goto next;
118 if (!(r->idiag_states & (1 << sk->sk_state)))
119 goto next;
120 if (r->sdiag_family != AF_UNSPEC &&
121 sk->sk_family != r->sdiag_family)
122 goto next;
123 if (r->id.idiag_sport != inet->inet_sport &&
124 r->id.idiag_sport)
125 goto next;
126 if (r->id.idiag_dport != inet->inet_dport &&
127 r->id.idiag_dport)
128 goto next;
129
130 if (sk_diag_dump(sk, skb, cb, r, bc) < 0) {
131 spin_unlock_bh(&hslot->lock);
132 goto done;
133 }
134next:
135 num++;
136 }
137 spin_unlock_bh(&hslot->lock);
138 }
139done:
140 cb->args[0] = slot;
141 cb->args[1] = num;
142}
143
144static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
145 struct inet_diag_req_v2 *r, struct nlattr *bc)
146{
147 udp_dump(&udp_table, skb, cb, r, bc);
148}
149
150static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
151 struct inet_diag_req_v2 *req)
152{
153 return udp_dump_one(&udp_table, in_skb, nlh, req);
154}
155
156static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
157 void *info)
158{
159 r->idiag_rqueue = sk_rmem_alloc_get(sk);
160 r->idiag_wqueue = sk_wmem_alloc_get(sk);
161}
162
163static const struct inet_diag_handler udp_diag_handler = {
164 .dump = udp_diag_dump,
165 .dump_one = udp_diag_dump_one,
166 .idiag_get_info = udp_diag_get_info,
167 .idiag_type = IPPROTO_UDP,
168};
169
170static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
171 struct inet_diag_req_v2 *r, struct nlattr *bc)
172{
173 udp_dump(&udplite_table, skb, cb, r, bc);
174}
175
176static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
177 struct inet_diag_req_v2 *req)
178{
179 return udp_dump_one(&udplite_table, in_skb, nlh, req);
180}
181
182static const struct inet_diag_handler udplite_diag_handler = {
183 .dump = udplite_diag_dump,
184 .dump_one = udplite_diag_dump_one,
185 .idiag_get_info = udp_diag_get_info,
186 .idiag_type = IPPROTO_UDPLITE,
187};
188
189static int __init udp_diag_init(void)
190{
191 int err;
192
193 err = inet_diag_register(&udp_diag_handler);
194 if (err)
195 goto out;
196 err = inet_diag_register(&udplite_diag_handler);
197 if (err)
198 goto out_lite;
199out:
200 return err;
201out_lite:
202 inet_diag_unregister(&udp_diag_handler);
203 goto out;
204}
205
206static void __exit udp_diag_exit(void)
207{
208 inet_diag_unregister(&udplite_diag_handler);
209 inet_diag_unregister(&udp_diag_handler);
210}
211
212module_init(udp_diag_init);
213module_exit(udp_diag_exit);
214MODULE_LICENSE("GPL");
215MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */);
216MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
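The deleted udp_diag.c is the kernel side of the sock_diag netlink interface for UDP/UDP-Lite sockets. As a rough illustration of what it answers, here is a minimal user-space dump request for all AF_INET/IPPROTO_UDP sockets, roughly what ss does on kernels that carry the diag modules. It only works where udp_diag is present, and most error handling is omitted; the uAPI structures and constants used are the standard ones from linux/sock_diag.h and linux/inet_diag.h.

/* Minimal SOCK_DIAG_BY_FAMILY dump of IPv4 UDP sockets. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/inet_diag.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg = {
		.nlh = {
			.nlmsg_len   = sizeof(msg),
			.nlmsg_type  = SOCK_DIAG_BY_FAMILY,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		},
		.req = {
			.sdiag_family   = AF_INET,
			.sdiag_protocol = IPPROTO_UDP,
			.idiag_states   = -1,   /* all socket states */
		},
	};
	char buf[16384];

	if (fd < 0)
		return 1;
	sendto(fd, &msg, sizeof(msg), 0,
	       (struct sockaddr *)&nladdr, sizeof(nladdr));

	for (;;) {
		int len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *h = (struct nlmsghdr *)buf;

		if (len <= 0)
			break;
		while (NLMSG_OK(h, len)) {
			struct inet_diag_msg *d;

			if (h->nlmsg_type == NLMSG_DONE ||
			    h->nlmsg_type == NLMSG_ERROR)
				goto out;
			d = NLMSG_DATA(h);
			printf("udp socket: sport %u inode %u\n",
			       ntohs(d->id.idiag_sport), d->idiag_inode);
			h = NLMSG_NEXT(h, len);
		}
	}
out:
	close(fd);
	return 0;
}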
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index 5a681e298b9..aaad650d47d 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -25,7 +25,7 @@ extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
25 size_t len, int noblock, int flags, int *addr_len); 25 size_t len, int noblock, int flags, int *addr_len);
26extern int udp_sendpage(struct sock *sk, struct page *page, int offset, 26extern int udp_sendpage(struct sock *sk, struct page *page, int offset,
27 size_t size, int flags); 27 size_t size, int flags);
28extern int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); 28extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
29extern void udp_destroy_sock(struct sock *sk); 29extern void udp_destroy_sock(struct sock *sk);
30 30
31#ifdef CONFIG_PROC_FS 31#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 2c46acd4cc3..aee9963f7f5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -10,10 +10,6 @@
10 * as published by the Free Software Foundation; either version 10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version. 11 * 2 of the License, or (at your option) any later version.
12 */ 12 */
13
14#define pr_fmt(fmt) "UDPLite: " fmt
15
16#include <linux/export.h>
17#include "udp_impl.h" 13#include "udp_impl.h"
18 14
19struct udp_table udplite_table __read_mostly; 15struct udp_table udplite_table __read_mostly;
@@ -75,20 +71,13 @@ static struct inet_protosw udplite4_protosw = {
75}; 71};
76 72
77#ifdef CONFIG_PROC_FS 73#ifdef CONFIG_PROC_FS
78
79static const struct file_operations udplite_afinfo_seq_fops = {
80 .owner = THIS_MODULE,
81 .open = udp_seq_open,
82 .read = seq_read,
83 .llseek = seq_lseek,
84 .release = seq_release_net
85};
86
87static struct udp_seq_afinfo udplite4_seq_afinfo = { 74static struct udp_seq_afinfo udplite4_seq_afinfo = {
88 .name = "udplite", 75 .name = "udplite",
89 .family = AF_INET, 76 .family = AF_INET,
90 .udp_table = &udplite_table, 77 .udp_table = &udplite_table,
91 .seq_fops = &udplite_afinfo_seq_fops, 78 .seq_fops = {
79 .owner = THIS_MODULE,
80 },
92 .seq_ops = { 81 .seq_ops = {
93 .show = udp4_seq_show, 82 .show = udp4_seq_show,
94 }, 83 },
@@ -132,11 +121,11 @@ void __init udplite4_register(void)
132 inet_register_protosw(&udplite4_protosw); 121 inet_register_protosw(&udplite4_protosw);
133 122
134 if (udplite4_proc_init()) 123 if (udplite4_proc_init())
135 pr_err("%s: Cannot register /proc!\n", __func__); 124 printk(KERN_ERR "%s: Cannot register /proc!\n", __func__);
136 return; 125 return;
137 126
138out_unregister_proto: 127out_unregister_proto:
139 proto_unregister(&udplite_prot); 128 proto_unregister(&udplite_prot);
140out_register_err: 129out_register_err:
141 pr_crit("%s: Cannot add UDP-Lite protocol\n", __func__); 130 printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
142} 131}
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index e3db3f91511..63418185f52 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -110,7 +110,10 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
110 110
111 skb_push(skb, sizeof(*iph)); 111 skb_push(skb, sizeof(*iph));
112 skb_reset_network_header(skb); 112 skb_reset_network_header(skb);
113 skb_mac_header_rebuild(skb); 113
114 memmove(skb->data - skb->mac_len, skb_mac_header(skb),
115 skb->mac_len);
116 skb_set_mac_header(skb, -skb->mac_len);
114 117
115 xfrm4_beet_make_header(skb); 118 xfrm4_beet_make_header(skb);
116 119
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index ddee0a099a2..534972e114a 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -15,65 +15,6 @@
15#include <net/ip.h> 15#include <net/ip.h>
16#include <net/xfrm.h> 16#include <net/xfrm.h>
17 17
18/* Informational hook. The decap is still done here. */
19static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly;
20static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex);
21
22int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler)
23{
24 struct xfrm_tunnel __rcu **pprev;
25 struct xfrm_tunnel *t;
26 int ret = -EEXIST;
27 int priority = handler->priority;
28
29 mutex_lock(&xfrm4_mode_tunnel_input_mutex);
30
31 for (pprev = &rcv_notify_handlers;
32 (t = rcu_dereference_protected(*pprev,
33 lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
34 pprev = &t->next) {
35 if (t->priority > priority)
36 break;
37 if (t->priority == priority)
38 goto err;
39
40 }
41
42 handler->next = *pprev;
43 rcu_assign_pointer(*pprev, handler);
44
45 ret = 0;
46
47err:
48 mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
49 return ret;
50}
51EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register);
52
53int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler)
54{
55 struct xfrm_tunnel __rcu **pprev;
56 struct xfrm_tunnel *t;
57 int ret = -ENOENT;
58
59 mutex_lock(&xfrm4_mode_tunnel_input_mutex);
60 for (pprev = &rcv_notify_handlers;
61 (t = rcu_dereference_protected(*pprev,
62 lockdep_is_held(&xfrm4_mode_tunnel_input_mutex))) != NULL;
63 pprev = &t->next) {
64 if (t == handler) {
65 *pprev = handler->next;
66 ret = 0;
67 break;
68 }
69 }
70 mutex_unlock(&xfrm4_mode_tunnel_input_mutex);
71 synchronize_net();
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_deregister);
76
77static inline void ipip_ecn_decapsulate(struct sk_buff *skb) 18static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
78{ 19{
79 struct iphdr *inner_iph = ipip_hdr(skb); 20 struct iphdr *inner_iph = ipip_hdr(skb);
@@ -123,14 +64,9 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
123 return 0; 64 return 0;
124} 65}
125 66
126#define for_each_input_rcu(head, handler) \
127 for (handler = rcu_dereference(head); \
128 handler != NULL; \
129 handler = rcu_dereference(handler->next))
130
131static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) 67static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
132{ 68{
133 struct xfrm_tunnel *handler; 69 const unsigned char *old_mac;
134 int err = -EINVAL; 70 int err = -EINVAL;
135 71
136 if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) 72 if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
@@ -139,9 +75,6 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
139 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 75 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
140 goto out; 76 goto out;
141 77
142 for_each_input_rcu(rcv_notify_handlers, handler)
143 handler->handler(skb);
144
145 if (skb_cloned(skb) && 78 if (skb_cloned(skb) &&
146 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) 79 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
147 goto out; 80 goto out;
@@ -151,9 +84,10 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
151 if (!(x->props.flags & XFRM_STATE_NOECN)) 84 if (!(x->props.flags & XFRM_STATE_NOECN))
152 ipip_ecn_decapsulate(skb); 85 ipip_ecn_decapsulate(skb);
153 86
87 old_mac = skb_mac_header(skb);
88 skb_set_mac_header(skb, -skb->mac_len);
89 memmove(skb_mac_header(skb), old_mac, skb->mac_len);
154 skb_reset_network_header(skb); 90 skb_reset_network_header(skb);
155 skb_mac_header_rebuild(skb);
156
157 err = 0; 91 err = 0;
158 92
159out: 93out:
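Both xfrm4 mode hunks above replace skb_mac_header_rebuild() with the older open-coded variant: memmove() the saved link-layer header so that it again sits immediately in front of the new (inner) network header after decapsulation. The stand-alone sketch below models that move on a flat byte buffer; the struct, offsets, and sizes are invented for the demo and are not sk_buff fields.

/* Model of re-placing a saved MAC header in front of the inner IP header. */
#include <stdio.h>
#include <string.h>

struct fake_skb {
	unsigned char buf[64];
	int mac_off;   /* where the MAC header currently starts   */
	int net_off;   /* where the (inner) network header starts */
	int mac_len;
};

/* Re-place the MAC header right before the network header. */
static void rebuild_mac_header(struct fake_skb *skb)
{
	int new_mac_off = skb->net_off - skb->mac_len;

	memmove(skb->buf + new_mac_off, skb->buf + skb->mac_off, skb->mac_len);
	skb->mac_off = new_mac_off;
}

int main(void)
{
	struct fake_skb skb = { .mac_off = 0, .net_off = 34, .mac_len = 14 };

	memcpy(skb.buf, "MACHDR-14bytes", 14);   /* stale, pre-decap position */
	memcpy(skb.buf + 34, "INNER-IP", 8);     /* inner header left after decap */

	rebuild_mac_header(&skb);
	printf("MAC header now at offset %d: %.*s\n",
	       skb.mac_off, skb.mac_len, skb.buf + skb.mac_off);
	return 0;
}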
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 3be0ac2c192..a0b4c5da8d4 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -79,21 +79,30 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
79 struct rtable *rt = (struct rtable *)xdst->route; 79 struct rtable *rt = (struct rtable *)xdst->route;
80 const struct flowi4 *fl4 = &fl->u.ip4; 80 const struct flowi4 *fl4 = &fl->u.ip4;
81 81
82 xdst->u.rt.rt_key_dst = fl4->daddr;
83 xdst->u.rt.rt_key_src = fl4->saddr;
84 xdst->u.rt.rt_key_tos = fl4->flowi4_tos;
85 xdst->u.rt.rt_route_iif = fl4->flowi4_iif;
82 xdst->u.rt.rt_iif = fl4->flowi4_iif; 86 xdst->u.rt.rt_iif = fl4->flowi4_iif;
87 xdst->u.rt.rt_oif = fl4->flowi4_oif;
88 xdst->u.rt.rt_mark = fl4->flowi4_mark;
83 89
84 xdst->u.dst.dev = dev; 90 xdst->u.dst.dev = dev;
85 dev_hold(dev); 91 dev_hold(dev);
86 92
93 xdst->u.rt.peer = rt->peer;
94 if (rt->peer)
95 atomic_inc(&rt->peer->refcnt);
96
87 /* Sheit... I remember I did this right. Apparently, 97 /* Sheit... I remember I did this right. Apparently,
88 * it was magically lost, so this code needs audit */ 98 * it was magically lost, so this code needs audit */
89 xdst->u.rt.rt_is_input = rt->rt_is_input;
90 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | 99 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
91 RTCF_LOCAL); 100 RTCF_LOCAL);
92 xdst->u.rt.rt_type = rt->rt_type; 101 xdst->u.rt.rt_type = rt->rt_type;
102 xdst->u.rt.rt_src = rt->rt_src;
103 xdst->u.rt.rt_dst = rt->rt_dst;
93 xdst->u.rt.rt_gateway = rt->rt_gateway; 104 xdst->u.rt.rt_gateway = rt->rt_gateway;
94 xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; 105 xdst->u.rt.rt_spec_dst = rt->rt_spec_dst;
95 xdst->u.rt.rt_pmtu = rt->rt_pmtu;
96 INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
97 106
98 return 0; 107 return 0;
99} 108}
@@ -143,7 +152,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
143 152
144 case IPPROTO_AH: 153 case IPPROTO_AH:
145 if (pskb_may_pull(skb, xprth + 8 - skb->data)) { 154 if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
146 __be32 *ah_hdr = (__be32 *)xprth; 155 __be32 *ah_hdr = (__be32*)xprth;
147 156
148 fl4->fl4_ipsec_spi = ah_hdr[1]; 157 fl4->fl4_ipsec_spi = ah_hdr[1];
149 } 158 }
@@ -189,22 +198,12 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
189 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); 198 return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
190} 199}
191 200
192static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, 201static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
193 struct sk_buff *skb, u32 mtu)
194{
195 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
196 struct dst_entry *path = xdst->route;
197
198 path->ops->update_pmtu(path, sk, skb, mtu);
199}
200
201static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
202 struct sk_buff *skb)
203{ 202{
204 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 203 struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
205 struct dst_entry *path = xdst->route; 204 struct dst_entry *path = xdst->route;
206 205
207 path->ops->redirect(path, sk, skb); 206 path->ops->update_pmtu(path, mtu);
208} 207}
209 208
210static void xfrm4_dst_destroy(struct dst_entry *dst) 209static void xfrm4_dst_destroy(struct dst_entry *dst)
@@ -213,6 +212,9 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
213 212
214 dst_destroy_metrics_generic(dst); 213 dst_destroy_metrics_generic(dst);
215 214
215 if (likely(xdst->u.rt.peer))
216 inet_putpeer(xdst->u.rt.peer);
217
216 xfrm_dst_destroy(xdst); 218 xfrm_dst_destroy(xdst);
217} 219}
218 220
@@ -230,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = {
230 .protocol = cpu_to_be16(ETH_P_IP), 232 .protocol = cpu_to_be16(ETH_P_IP),
231 .gc = xfrm4_garbage_collect, 233 .gc = xfrm4_garbage_collect,
232 .update_pmtu = xfrm4_update_pmtu, 234 .update_pmtu = xfrm4_update_pmtu,
233 .redirect = xfrm4_redirect,
234 .cow_metrics = dst_cow_metrics_generic, 235 .cow_metrics = dst_cow_metrics_generic,
235 .destroy = xfrm4_dst_destroy, 236 .destroy = xfrm4_dst_destroy,
236 .ifdown = xfrm4_dst_ifdown, 237 .ifdown = xfrm4_dst_ifdown,
@@ -279,15 +280,26 @@ static void __exit xfrm4_policy_fini(void)
279 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); 280 xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
280} 281}
281 282
282void __init xfrm4_init(void) 283void __init xfrm4_init(int rt_max_size)
283{ 284{
285 /*
286 * Select a default value for the gc_thresh based on the main route
287 * table hash size. It seems to me the worst case scenario is when
288 * we have ipsec operating in transport mode, in which we create a
289 * dst_entry per socket. The xfrm gc algorithm starts trying to remove
290 * entries at gc_thresh, and prevents new allocations as 2*gc_thresh
291 * so lets set an initial xfrm gc_thresh value at the rt_max_size/2.
292 * That will let us store an ipsec connection per route table entry,
293 * and start cleaning when were 1/2 full
294 */
295 xfrm4_dst_ops.gc_thresh = rt_max_size/2;
284 dst_entries_init(&xfrm4_dst_ops); 296 dst_entries_init(&xfrm4_dst_ops);
285 297
286 xfrm4_state_init(); 298 xfrm4_state_init();
287 xfrm4_policy_init(); 299 xfrm4_policy_init();
288#ifdef CONFIG_SYSCTL 300#ifdef CONFIG_SYSCTL
289 sysctl_hdr = register_net_sysctl(&init_net, "net/ipv4", 301 sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path,
290 xfrm4_policy_table); 302 xfrm4_policy_table);
291#endif 303#endif
292} 304}
293 305
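The comment restored in xfrm4_init() sizes the xfrm dst garbage collector from the routing cache limit: collection starts at rt_max_size/2, and since new allocations are refused at twice gc_thresh, the hard stop lands at rt_max_size itself. The snippet below just works that arithmetic through for an assumed rt_max_size; the value is illustrative, not taken from any real configuration.

/* Worked example of the gc_thresh sizing described in the comment above. */
#include <stdio.h>

int main(void)
{
	int rt_max_size = 524288;            /* assumed routing cache limit */
	int gc_thresh   = rt_max_size / 2;   /* start collecting here       */
	int hard_limit  = 2 * gc_thresh;     /* refuse new entries here     */

	printf("gc_thresh=%d, allocation stops at %d (== rt_max_size)\n",
	       gc_thresh, hard_limit);
	return 0;
}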
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 9258e751bab..d9ac0a0058b 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -12,7 +12,6 @@
12#include <linux/pfkeyv2.h> 12#include <linux/pfkeyv2.h>
13#include <linux/ipsec.h> 13#include <linux/ipsec.h>
14#include <linux/netfilter_ipv4.h> 14#include <linux/netfilter_ipv4.h>
15#include <linux/export.h>
16 15
17static int xfrm4_init_flags(struct xfrm_state *x) 16static int xfrm4_init_flags(struct xfrm_state *x)
18{ 17{
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 05a5df2febc..82806455e85 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -3,8 +3,6 @@
3 * Copyright (C) 2003 David S. Miller (davem@redhat.com) 3 * Copyright (C) 2003 David S. Miller (davem@redhat.com)
4 */ 4 */
5 5
6#define pr_fmt(fmt) "IPsec: " fmt
7
8#include <linux/skbuff.h> 6#include <linux/skbuff.h>
9#include <linux/module.h> 7#include <linux/module.h>
10#include <linux/mutex.h> 8#include <linux/mutex.h>
@@ -66,7 +64,7 @@ static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
66 .priority = 2, 64 .priority = 2,
67}; 65};
68 66
69#if IS_ENABLED(CONFIG_IPV6) 67#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
70static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { 68static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
71 .handler = xfrm_tunnel_rcv, 69 .handler = xfrm_tunnel_rcv,
72 .err_handler = xfrm_tunnel_err, 70 .err_handler = xfrm_tunnel_err,
@@ -77,18 +75,18 @@ static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
77static int __init ipip_init(void) 75static int __init ipip_init(void)
78{ 76{
79 if (xfrm_register_type(&ipip_type, AF_INET) < 0) { 77 if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
80 pr_info("%s: can't add xfrm type\n", __func__); 78 printk(KERN_INFO "ipip init: can't add xfrm type\n");
81 return -EAGAIN; 79 return -EAGAIN;
82 } 80 }
83 81
84 if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) { 82 if (xfrm4_tunnel_register(&xfrm_tunnel_handler, AF_INET)) {
85 pr_info("%s: can't add xfrm handler for AF_INET\n", __func__); 83 printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET\n");
86 xfrm_unregister_type(&ipip_type, AF_INET); 84 xfrm_unregister_type(&ipip_type, AF_INET);
87 return -EAGAIN; 85 return -EAGAIN;
88 } 86 }
89#if IS_ENABLED(CONFIG_IPV6) 87#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
90 if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) { 88 if (xfrm4_tunnel_register(&xfrm64_tunnel_handler, AF_INET6)) {
91 pr_info("%s: can't add xfrm handler for AF_INET6\n", __func__); 89 printk(KERN_INFO "ipip init: can't add xfrm handler for AF_INET6\n");
92 xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET); 90 xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET);
93 xfrm_unregister_type(&ipip_type, AF_INET); 91 xfrm_unregister_type(&ipip_type, AF_INET);
94 return -EAGAIN; 92 return -EAGAIN;
@@ -99,16 +97,14 @@ static int __init ipip_init(void)
99 97
100static void __exit ipip_fini(void) 98static void __exit ipip_fini(void)
101{ 99{
102#if IS_ENABLED(CONFIG_IPV6) 100#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
103 if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6)) 101 if (xfrm4_tunnel_deregister(&xfrm64_tunnel_handler, AF_INET6))
104 pr_info("%s: can't remove xfrm handler for AF_INET6\n", 102 printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET6\n");
105 __func__);
106#endif 103#endif
107 if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET)) 104 if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
108 pr_info("%s: can't remove xfrm handler for AF_INET\n", 105 printk(KERN_INFO "ipip close: can't remove xfrm handler for AF_INET\n");
109 __func__);
110 if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) 106 if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
111 pr_info("%s: can't remove xfrm type\n", __func__); 107 printk(KERN_INFO "ipip close: can't remove xfrm type\n");
112} 108}
113 109
114module_init(ipip_init); 110module_init(ipip_init);